From 93f2f30017595e419d25c5dd6fa6a8b320a1858f Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Sun, 26 Jan 2020 16:33:45 +0100 Subject: [PATCH] Make NbnxmGpu a class with a constructor This is needed for future reform of the NB GPU code. Also converted the "staging" struct to use member initializers (which are used to generate the default constructor), because it is held by value and would otherwise have a default constructor that leaves its fields uninitialized. Also replaced cl_bool with plain bool, as there was no advantage to the former. Change-Id: I1b63f0a8145dbd911062d1039f129074b0abdc3e --- docs/doxygen/suppressions.txt | 2 +- src/gromacs/nbnxm/atomdata.cpp | 4 +- src/gromacs/nbnxm/atomdata.h | 6 +- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 14 ++--- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 55 ++++++++-------- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 63 +++++++++---------- src/gromacs/nbnxm/gpu_common.h | 12 ++-- src/gromacs/nbnxm/gpu_common_utils.h | 2 +- src/gromacs/nbnxm/gpu_data_mgmt.h | 41 ++++++------ src/gromacs/nbnxm/gpu_jit_support.h | 4 +- src/gromacs/nbnxm/nbnxm.h | 6 +- src/gromacs/nbnxm/nbnxm_gpu.h | 28 ++++----- src/gromacs/nbnxm/nbnxm_setup.cpp | 8 +-- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 10 +-- .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 49 +++++++-------- .../nbnxm/opencl/nbnxm_ocl_jit_support.cpp | 2 +- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 62 +++++++++--------- 17 files changed, 182 insertions(+), 186 deletions(-) diff --git a/docs/doxygen/suppressions.txt b/docs/doxygen/suppressions.txt index 8f3a2dd479..964a2c8197 100644 --- a/docs/doxygen/suppressions.txt +++ b/docs/doxygen/suppressions.txt @@ -35,7 +35,7 @@ src/gromacs/nbnxm/kernels_simd_2xmm/kernel_common.h: warning: should include "nb src/gromacs/nbnxm/kernels_simd_4xm/kernel_common.h: warning: should include "nbnxm_simd.h" # This seems to be a false positive -src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h: error: gmx_nbnxm_gpu_t: is in internal file(s), but appears in public documentation +src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h: error: NbnxmGpu: is in internal file(s), but appears in public documentation # Temporary while we change the SIMD implementation src/gromacs/simd/impl_sparc64_hpc_ace/impl_sparc64_hpc_ace_common.h: warning: should include "simd.h" diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 8fd60a8f03..89d2e762b8 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1074,7 +1074,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet, void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, const gmx::AtomLocality locality, bool fillLocal, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice) { @@ -1463,7 +1463,7 @@ void reduceForcesGpu(const gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, void* pmeForcesDevice, gmx::ArrayRef dependencyList, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, bool useGpuFPmeReduction, bool accumulateForce) { diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index d41408d953..ceb87f71f0 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -61,7 +61,7 @@ namespace gmx { class MDLogger; } -struct gmx_nbnxm_gpu_t; +struct NbnxmGpu; struct nbnxn_atomdata_t; struct nonbonded_verlet_t; struct t_mdatoms; @@ -379,7 +379,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet, void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, gmx::AtomLocality locality, bool
fillLocal, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice); @@ -408,7 +408,7 @@ void reduceForcesGpu(gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, void* pmeForcesDevice, gmx::ArrayRef dependencyList, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, bool useGpuFPmeReduction, bool accumulateForce); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index f5135d7bfb..22da9946f1 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -363,7 +363,7 @@ static inline int calc_shmem_required_nonbonded(const int num_thre * the local, this function records the event if called with the local stream as * argument and inserts in the GPU stream a wait on the event on the nonlocal. */ -void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t* nb, const InteractionLocality interactionLocality) +void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality) { cudaStream_t stream = nb->stream[interactionLocality]; @@ -389,7 +389,7 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t* nb, const Interacti } /*! \brief Launch asynchronously the xq buffer host to device copy. */ -void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) +void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -477,7 +477,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, con the local x+q H2D (and all preceding) tasks are complete and synchronize with this event in the non-local stream before launching the non-bonded kernel. */ -void gpu_launch_kernel(gmx_nbnxm_gpu_t* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) +void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { cu_atomdata_t* adat = nb->atdat; cu_nbparam_t* nbp = nb->nbparam; @@ -589,7 +589,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) return shmem; } -void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc, const int numParts) +void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { cu_atomdata_t* adat = nb->atdat; cu_nbparam_t* nbp = nb->nbparam; @@ -713,7 +713,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality } } -void gpu_launch_cpyback(gmx_nbnxm_gpu_t* nb, +void gpu_launch_cpyback(NbnxmGpu* nb, nbnxn_atomdata_t* nbatom, const gmx::StepWorkload& stepWork, const AtomLocality atomLocality) @@ -817,7 +817,7 @@ void cuda_set_cacheconfig() /* X buffer operations on GPU: performs conversion from rvec to nb format. 
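The nbnxnInsertNonlocalGpuDependency() change above keeps the established record/wait idiom for ordering the local and non-local CUDA streams. A minimal sketch of that idiom, using only standard CUDA runtime calls (function and variable names here are illustrative, not GROMACS code):

```cpp
#include <cuda_runtime.h>

// Record a point in the producer stream and make the consumer stream wait on
// it. Only device-side ordering is enforced; the host is never blocked.
void insertStreamDependency(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t event)
{
    cudaEventRecord(event, producer);        // mark all work enqueued so far
    cudaStreamWaitEvent(consumer, event, 0); // later consumer work waits on it
}
```

This is the shape behind both synchronization events declared later in nbnxm_cuda_types.h: the local stream records misc_ops_and_local_H2D_done for the non-local stream, and the non-local kernel's completion is published through nonlocal_done.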
*/ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, bool setFillerCoords, - gmx_nbnxm_gpu_t* nb, + NbnxmGpu* nb, DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice, const Nbnxm::AtomLocality locality, @@ -885,7 +885,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, */ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, DeviceBuffer totalForcesDevice, - gmx_nbnxm_gpu_t* nb, + NbnxmGpu* nb, void* pmeForcesDevice, gmx::ArrayRef dependencyList, int atomStart, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 2714dfee8d..d99bdfd774 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -45,7 +45,7 @@ #include #include -// TODO We would like to move this down, but the way gmx_nbnxm_gpu_t +// TODO We would like to move this down, but the way NbnxmGpu // is currently declared means this has to be before gpu_types.h #include "nbnxm_cuda_types.h" @@ -89,7 +89,7 @@ namespace Nbnxm static unsigned int gpu_min_ci_balanced_factor = 44; /* Fw. decl. */ -static void nbnxn_cuda_clear_e_fshift(gmx_nbnxm_gpu_t* nb); +static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb); /* Fw. decl, */ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam); @@ -400,7 +400,7 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t* t) } /*! Initializes simulation constant data. */ -static void cuda_init_const(gmx_nbnxm_gpu_t* nb, +static void cuda_init_const(NbnxmGpu* nb, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams) @@ -412,17 +412,16 @@ static void cuda_init_const(gmx_nbnxm_gpu_t* nb, nbnxn_cuda_clear_e_fshift(nb); } -gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t* deviceInfo, - const interaction_const_t* ic, - const PairlistParams& listParams, - const nbnxn_atomdata_t* nbat, - int /*rank*/, - gmx_bool bLocalAndNonlocal) +NbnxmGpu* gpu_init(const gmx_device_info_t* deviceInfo, + const interaction_const_t* ic, + const PairlistParams& listParams, + const nbnxn_atomdata_t* nbat, + int /*rank*/, + gmx_bool bLocalAndNonlocal) { cudaError_t stat; - gmx_nbnxm_gpu_t* nb; - snew(nb, 1); + auto nb = new NbnxmGpu; snew(nb->atdat, 1); snew(nb->nbparam, 1); snew(nb->plist[InteractionLocality::Local], 1); @@ -509,7 +508,7 @@ gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t* deviceInfo, return nb; } -void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) +void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) { char sbuf[STRLEN]; bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); @@ -565,7 +564,7 @@ void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, con d_plist->haveFreshList = true; } -void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom) +void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cu_atomdata_t* adat = nb->atdat; cudaStream_t ls = nb->stream[InteractionLocality::Local]; @@ -579,7 +578,7 @@ void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom) } /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ -static void nbnxn_cuda_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear) +static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; @@ -590,7 +589,7 @@ static void nbnxn_cuda_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear) } /*! 
Clears nonbonded shift force output array and energy outputs on the GPU. */ -static void nbnxn_cuda_clear_e_fshift(gmx_nbnxm_gpu_t* nb) +static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; @@ -604,7 +603,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxm_gpu_t* nb) CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied"); } -void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial) +void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { nbnxn_cuda_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -615,7 +614,7 @@ void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial) } } -void gpu_init_atomdata(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbat) +void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { cudaError_t stat; int nalloc, natoms; @@ -702,7 +701,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam) } } -void gpu_free(gmx_nbnxm_gpu_t* nb) +void gpu_free(NbnxmGpu* nb) { cudaError_t stat; cu_atomdata_t* atdat; @@ -789,7 +788,7 @@ void gpu_free(gmx_nbnxm_gpu_t* nb) sfree(atdat); sfree(nbparam); sfree(nb->timings); - sfree(nb); + delete nb; if (debug) { @@ -798,7 +797,7 @@ void gpu_free(gmx_nbnxm_gpu_t* nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t* nb) +gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb) { return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } @@ -811,38 +810,38 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv) } } -int gpu_min_ci_balanced(gmx_nbnxm_gpu_t* nb) +int gpu_min_ci_balanced(NbnxmGpu* nb) { return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->prop.multiProcessorCount : 0; } -gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t* nb) +gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb) { return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN)); } -void* gpu_get_command_stream(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc) +void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc) { assert(nb); return static_cast(&nb->stream[iloc]); } -void* gpu_get_xq(gmx_nbnxm_gpu_t* nb) +void* gpu_get_xq(NbnxmGpu* nb) { assert(nb); return static_cast(nb->atdat->xq); } -void* gpu_get_f(gmx_nbnxm_gpu_t* nb) +void* gpu_get_f(NbnxmGpu* nb) { assert(nb); return static_cast(nb->atdat->f); } -rvec* gpu_get_fshift(gmx_nbnxm_gpu_t* nb) +rvec* gpu_get_fshift(NbnxmGpu* nb) { assert(nb); @@ -851,7 +850,7 @@ rvec* gpu_get_fshift(gmx_nbnxm_gpu_t* nb) /* Initialization for X buffer operations on GPU. */ /* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/ -void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, gmx_nbnxm_gpu_t* gpu_nbv) +void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv) { cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; bool bDoTime = gpu_nbv->bDoTime; @@ -937,7 +936,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, gmx_nbnxm_gpu_t* /* Initialization for F buffer operations on GPU. 
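The allocation changes above (snew() replaced by `new NbnxmGpu` in gpu_init(), sfree() by `delete nb` in gpu_free()) are what make the new in-class initializers take effect: a C-style zeroing allocation never runs a constructor, so default member initializers are silently skipped. A minimal stand-alone sketch of the difference (illustrative code, with a non-zero default so the effect is visible):

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for the staging struct; a non-zero default makes the
// difference visible (the real struct's defaults happen to be nullptr/0).
struct Staging
{
    float scaleFactor = 1.0f;
};

int main()
{
    // What snew() effectively does: zero-filled C allocation. No constructor
    // runs, the member initializer is skipped, and scaleFactor reads as 0.
    Staging* a = static_cast<Staging*>(std::calloc(1, sizeof(Staging)));

    // A new-expression runs the implicit default constructor, which applies
    // the member initializer, so scaleFactor is 1 as intended.
    Staging* b = new Staging;

    std::printf("calloc: %g  new: %g\n", a->scaleFactor, b->scaleFactor);

    std::free(a); // calloc pairs with free()
    delete b;     // new pairs with delete
}
```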
*/ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, int natoms_total, GpuEventSynchronizer* const localReductionDone) { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 911536167b..be7d861639 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -128,7 +128,6 @@ enum evdwCu /*! \cond */ typedef struct cu_atomdata cu_atomdata_t; typedef struct cu_nbparam cu_nbparam_t; -typedef struct nb_staging nb_staging_t; /*! \endcond */ @@ -138,14 +137,14 @@ typedef struct nb_staging nb_staging_t; * The energies/shift forces get downloaded here first, before getting added * to the CPU-side aggregate values. */ -struct nb_staging +struct nb_staging_t { //! LJ energy - float* e_lj; + float* e_lj = nullptr; //! electrostatic energy - float* e_el; + float* e_el = nullptr; //! shift forces - float3* fshift; + float3* fshift = nullptr; }; /** \internal @@ -267,58 +266,58 @@ class GpuEventSynchronizer; /*! \internal * \brief Main data structure for CUDA nonbonded force calculations. */ -struct gmx_nbnxm_gpu_t +struct NbnxmGpu { /*! \brief CUDA device information */ - const gmx_device_info_t* dev_info; + const gmx_device_info_t* dev_info = nullptr; /*! \brief true if doing both local/non-local NB work on GPU */ - bool bUseTwoStreams; + bool bUseTwoStreams = false; /*! \brief atom data */ - cu_atomdata_t* atdat; + cu_atomdata_t* atdat = nullptr; /*! \brief f buf ops cell index mapping */ - int* cell; + int* cell = nullptr; /*! \brief number of indices in cell buffer */ - int ncell; + int ncell = 0; /*! \brief number of indices allocated in cell buffer */ - int ncell_alloc; + int ncell_alloc = 0; /*! \brief array of atom indices */ - int* atomIndices; + int* atomIndices = nullptr; /*! \brief size of atom indices */ - int atomIndicesSize; + int atomIndicesSize = 0; /*! \brief size of atom indices allocated in device buffer */ - int atomIndicesSize_alloc; + int atomIndicesSize_alloc = 0; /*! \brief x buf ops num of atoms */ - int* cxy_na; + int* cxy_na = nullptr; /*! \brief number of elements in cxy_na */ - int ncxy_na; + int ncxy_na = 0; /*! \brief number of elements allocated allocated in device buffer */ - int ncxy_na_alloc; + int ncxy_na_alloc = 0; /*! \brief x buf ops cell index mapping */ - int* cxy_ind; + int* cxy_ind = nullptr; /*! \brief number of elements in cxy_ind */ - int ncxy_ind; + int ncxy_ind = 0; /*! \brief number of elements allocated allocated in device buffer */ - int ncxy_ind_alloc; + int ncxy_ind_alloc = 0; /*! \brief parameters required for the non-bonded calc. */ - cu_nbparam_t* nbparam; + cu_nbparam_t* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ - gmx::EnumerationArray plist; + gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded */ nb_staging_t nbst; /*! \brief local and non-local GPU streams */ - gmx::EnumerationArray stream; + gmx::EnumerationArray stream = { { nullptr } }; /*! \brief Events used for synchronization */ /*! \{ */ /*! \brief Event triggered when the non-local non-bonded * kernel is done (and the local transfer can proceed) */ - cudaEvent_t nonlocal_done; + cudaEvent_t nonlocal_done = nullptr; /*! \brief Event triggered when the tasks issued in the local * stream that need to precede the non-local force or buffer * operation calculations are done (e.g. 
f buffer 0-ing, local * x/q H2D, buffer op initialization in local stream that is * required also by nonlocal stream ) */ - cudaEvent_t misc_ops_and_local_H2D_done; + cudaEvent_t misc_ops_and_local_H2D_done = nullptr; /*! \} */ /*! \brief True if there is work for the current domain in the @@ -329,7 +328,7 @@ struct gmx_nbnxm_gpu_t * domain. As long as bonded work is not split up into * local/nonlocal, if there is bonded GPU work, both flags * will be true. */ - gmx::EnumerationArray haveWork; + gmx::EnumerationArray haveWork = { { false } }; /*! \brief Pointer to event synchronizer triggered when the local * GPU buffer ops / reduction is complete @@ -337,22 +336,22 @@ struct gmx_nbnxm_gpu_t * \note That the synchronizer is managed outside of this module * in StatePropagatorDataGpu. */ - GpuEventSynchronizer* localFReductionDone; + GpuEventSynchronizer* localFReductionDone = nullptr; /*! \brief Event triggered when non-local coordinate buffer * has been copied from device to host. */ - GpuEventSynchronizer* xNonLocalCopyD2HDone; + GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr; /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple * concurrent streams, so we won't time if both l/nl work is done on GPUs. * Timer init/uninit is still done even with timing off so only the condition * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */ /*! \brief True if event-based timing is enabled. */ - bool bDoTime; + bool bDoTime = false; /*! \brief CUDA event-based timers. */ - cu_timers_t* timers; + cu_timers_t* timers = nullptr; /*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */ - gmx_wallclock_gpu_nbnxn_t* timings; + gmx_wallclock_gpu_nbnxn_t* timings = nullptr; }; #endif /* NBNXN_CUDA_TYPES_H */ diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index a8369ce2d2..dcfd2f8fef 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -124,9 +124,7 @@ static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocalit //NOLINTNEXTLINE(misc-definitions-in-headers) -void setupGpuShortRangeWork(gmx_nbnxm_gpu_t* nb, - const gmx::GpuBonded* gpuBonded, - const gmx::InteractionLocality iLocality) +void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -146,13 +144,13 @@ void setupGpuShortRangeWork(gmx_nbnxm_gpu_t* nb, * \param[inout] nb Pointer to the nonbonded GPU data structure * \param[in] iLocality Interaction locality identifier */ -static bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t& nb, const gmx::InteractionLocality iLocality) +static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality) { return nb.haveWork[iLocality]; } //NOLINTNEXTLINE(misc-definitions-in-headers) -bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t* nb, const gmx::AtomLocality aLocality) +bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -362,7 +360,7 @@ static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings, * \todo Move into shared source file with gmx_compile_cpp_as_cuda */ //NOLINTNEXTLINE(misc-definitions-in-headers) -bool gpu_try_finish_task(gmx_nbnxm_gpu_t* nb, +bool gpu_try_finish_task(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const AtomLocality aloc, real* e_lj, @@ -458,7 +456,7 @@ bool 
gpu_try_finish_task(gmx_nbnxm_gpu_t* nb, * \return The number of cycles the gpu wait took */ //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file -float gpu_wait_finish_task(gmx_nbnxm_gpu_t* nb, +float gpu_wait_finish_task(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, AtomLocality aloc, real* e_lj, diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h index 176ab8f045..4882c3530e 100644 --- a/src/gromacs/nbnxm/gpu_common_utils.h +++ b/src/gromacs/nbnxm/gpu_common_utils.h @@ -64,7 +64,7 @@ namespace Nbnxm * local part of the force array also depends on the non-local kernel. * The skip of the local kernel is taken care of separately. */ -static inline bool canSkipNonbondedWork(const gmx_nbnxm_gpu_t& nb, InteractionLocality iloc) +static inline bool canSkipNonbondedWork(const NbnxmGpu& nb, InteractionLocality iloc) { assert(nb.plist[iloc]); return (iloc == InteractionLocality::NonLocal && nb.plist[iloc]->nsci == 0); diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index 2f504e91f8..30d44159ca 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -50,7 +50,7 @@ #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/locality.h" -struct gmx_nbnxm_gpu_t; +struct NbnxmGpu; struct gmx_gpu_info_t; struct gmx_device_info_t; struct gmx_wallclock_gpu_nbnxn_t; @@ -63,23 +63,23 @@ namespace Nbnxm /** Initializes the data structures related to GPU nonbonded calculations. */ GPU_FUNC_QUALIFIER -gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t gmx_unused* deviceInfo, - const interaction_const_t gmx_unused* ic, - const PairlistParams gmx_unused& listParams, - const nbnxn_atomdata_t gmx_unused* nbat, - int gmx_unused rank, - /* true if both local and non-local are done on GPU */ - gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM_WITH_RETURN(nullptr); +NbnxmGpu* gpu_init(const gmx_device_info_t gmx_unused* deviceInfo, + const interaction_const_t gmx_unused* ic, + const PairlistParams gmx_unused& listParams, + const nbnxn_atomdata_t gmx_unused* nbat, + int gmx_unused rank, + /* true if both local and non-local are done on GPU */ + gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM_WITH_RETURN(nullptr); /** Initializes pair-list data for GPU, called at every pair search step. */ GPU_FUNC_QUALIFIER -void gpu_init_pairlist(gmx_nbnxm_gpu_t gmx_unused* nb, +void gpu_init_pairlist(NbnxmGpu gmx_unused* nb, const struct NbnxnPairlistGpu gmx_unused* h_nblist, gmx::InteractionLocality gmx_unused iloc) GPU_FUNC_TERM; /** Initializes atom-data on the GPU, called at every pair search step. */ GPU_FUNC_QUALIFIER -void gpu_init_atomdata(gmx_nbnxm_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbat) GPU_FUNC_TERM; +void gpu_init_atomdata(NbnxmGpu gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbat) GPU_FUNC_TERM; /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. @@ -90,19 +90,19 @@ void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused* nb /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */ GPU_FUNC_QUALIFIER -void gpu_upload_shiftvec(gmx_nbnxm_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbatom) GPU_FUNC_TERM; +void gpu_upload_shiftvec(NbnxmGpu gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbatom) GPU_FUNC_TERM; /** Clears GPU outputs: nonbonded force, shift force and energy. 
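The haveWork flags queried by haveGpuShortRangeWork() above live in a gmx::EnumerationArray, an array indexed directly by enum-class values. A much-simplified, self-contained analogue (hypothetical code, not the real template) showing why brace-initialized defaults like `haveWork = { { false } }` work:

```cpp
#include <array>
#include <cstddef>

enum class InteractionLocality : int { Local, NonLocal, Count };

// Hypothetical, much-simplified analogue of gmx::EnumerationArray: storage
// indexed directly by enum-class values instead of raw integers.
template<typename EnumType, typename T>
struct EnumerationArray
{
    std::array<T, static_cast<std::size_t>(EnumType::Count)> data;

    T&       operator[](EnumType e)       { return data[static_cast<std::size_t>(e)]; }
    const T& operator[](EnumType e) const { return data[static_cast<std::size_t>(e)]; }
};

struct Example
{
    // Mirrors `haveWork = { { false } }` above: brace elision reaches the
    // inner array, and unlisted elements are value-initialized to false.
    EnumerationArray<InteractionLocality, bool> haveWork = { { false } };
};
```

Because the wrapper is an aggregate, elements not listed in the braces are value-initialized, so every locality slot starts as false without spelling each one out.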
*/ GPU_FUNC_QUALIFIER -void gpu_clear_outputs(gmx_nbnxm_gpu_t gmx_unused* nb, bool gmx_unused computeVirial) GPU_FUNC_TERM; +void gpu_clear_outputs(NbnxmGpu gmx_unused* nb, bool gmx_unused computeVirial) GPU_FUNC_TERM; /** Frees all GPU resources used for the nonbonded calculations. */ GPU_FUNC_QUALIFIER -void gpu_free(gmx_nbnxm_gpu_t gmx_unused* nb) GPU_FUNC_TERM; +void gpu_free(NbnxmGpu gmx_unused* nb) GPU_FUNC_TERM; /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */ GPU_FUNC_QUALIFIER -struct gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t gmx_unused* nb) +struct gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(nullptr); /** Resets nonbonded GPU timings. */ @@ -112,37 +112,36 @@ void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused* nbv) GPU_FUNC_TERM; /** Calculates the minimum size of proximity lists to improve SM load balance * with GPU non-bonded kernels. */ GPU_FUNC_QUALIFIER -int gpu_min_ci_balanced(gmx_nbnxm_gpu_t gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(-1); +int gpu_min_ci_balanced(NbnxmGpu gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(-1); /** Returns if analytical Ewald GPU kernels are used. */ GPU_FUNC_QUALIFIER -gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t gmx_unused* nb) - GPU_FUNC_TERM_WITH_RETURN(FALSE); +gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(FALSE); /** Returns an opaque pointer to the GPU command stream * Note: CUDA only. */ CUDA_FUNC_QUALIFIER -void* gpu_get_command_stream(gmx_nbnxm_gpu_t gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) +void* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr); /** Returns an opaque pointer to the GPU coordinate+charge array * Note: CUDA only. */ CUDA_FUNC_QUALIFIER -void* gpu_get_xq(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); +void* gpu_get_xq(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); /** Returns an opaque pointer to the GPU force array * Note: CUDA only. */ CUDA_FUNC_QUALIFIER -void* gpu_get_f(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); +void* gpu_get_f(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); /** Returns an opaque pointer to the GPU shift force array * Note: CUDA only. */ CUDA_FUNC_QUALIFIER -rvec* gpu_get_fshift(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); +rvec* gpu_get_fshift(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/gpu_jit_support.h b/src/gromacs/nbnxm/gpu_jit_support.h index 3a5928d2bb..183fcadc7c 100644 --- a/src/gromacs/nbnxm/gpu_jit_support.h +++ b/src/gromacs/nbnxm/gpu_jit_support.h @@ -46,9 +46,9 @@ #include "gromacs/utility/basedefinitions.h" -struct gmx_nbnxm_gpu_t; +struct NbnxmGpu; /*! 
\brief Handles any JIT compilation of nbnxn kernels for the selected device */ -OPENCL_FUNC_QUALIFIER void nbnxn_gpu_compile_kernels(gmx_nbnxm_gpu_t gmx_unused* nb) OPENCL_FUNC_TERM; +OPENCL_FUNC_QUALIFIER void nbnxn_gpu_compile_kernels(NbnxmGpu gmx_unused* nb) OPENCL_FUNC_TERM; #endif diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index c820e57726..4fc235ca4f 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -125,7 +125,7 @@ struct gmx_domdec_zones_t; struct gmx_enerdata_t; struct gmx_hw_info_t; struct gmx_mtop_t; -struct gmx_nbnxm_gpu_t; +struct NbnxmGpu; struct gmx_wallcycle; struct interaction_const_t; struct nbnxn_atomdata_t; @@ -225,7 +225,7 @@ public: std::unique_ptr pairSearch, std::unique_ptr nbat, const Nbnxm::KernelSetup& kernelSetup, - gmx_nbnxm_gpu_t* gpu_nbv, + NbnxmGpu* gpu_nbv, gmx_wallcycle* wcycle); ~nonbonded_verlet_t(); @@ -403,7 +403,7 @@ private: public: //! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr) - gmx_nbnxm_gpu_t* gpu_nbv; + NbnxmGpu* gpu_nbv; }; namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index adbbcf7f0c..7b9e4b80f9 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -78,7 +78,7 @@ class Grid; * \param [in] aloc Atom locality flag. */ GPU_FUNC_QUALIFIER -void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t gmx_unused* nb, +void gpu_copy_xq_to_gpu(NbnxmGpu gmx_unused* nb, const struct nbnxn_atomdata_t gmx_unused* nbdata, gmx::AtomLocality gmx_unused aloc) GPU_FUNC_TERM; @@ -93,7 +93,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t gmx_unused* nb, * */ GPU_FUNC_QUALIFIER -void gpu_launch_kernel(gmx_nbnxm_gpu_t gmx_unused* nb, +void gpu_launch_kernel(NbnxmGpu gmx_unused* nb, const gmx::StepWorkload gmx_unused& stepWork, gmx::InteractionLocality gmx_unused iloc) GPU_FUNC_TERM; @@ -133,7 +133,7 @@ void gpu_launch_kernel(gmx_nbnxm_gpu_t gmx_unused* nb, * \param [in] numParts Number of parts the pair list is split into in the rolling kernel. */ GPU_FUNC_QUALIFIER -void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t gmx_unused* nb, +void gpu_launch_kernel_pruneonly(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc, int gmx_unused numParts) GPU_FUNC_TERM; @@ -142,7 +142,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t gmx_unused* nb, * (and energies/shift forces if required). 
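The GPU_FUNC_QUALIFIER / GPU_FUNC_TERM_WITH_RETURN decorations used throughout these headers implement a stub pattern: with GPU support the macro pair yields a plain declaration implemented by a backend source file; without it, the header itself provides an empty inline stub so callers need no #ifdefs. Roughly (a simplified sketch of the idea, not the exact GROMACS definitions):

```cpp
#if GMX_GPU
#    define GPU_FUNC_QUALIFIER
#    define GPU_FUNC_TERM ;
#    define GPU_FUNC_TERM_WITH_RETURN(value) ;
#else
#    define GPU_FUNC_QUALIFIER static inline
#    define GPU_FUNC_TERM {}
#    define GPU_FUNC_TERM_WITH_RETURN(value) { return (value); }
#endif

struct NbnxmGpu; // opaque to common code; defined per backend

// Expands either to a bare declaration (GPU build) or an inline stub
// returning -1 (CPU-only build), so call sites are identical in both.
GPU_FUNC_QUALIFIER
int gpu_min_ci_balanced(NbnxmGpu* /* nb */) GPU_FUNC_TERM_WITH_RETURN(-1)
```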
*/ GPU_FUNC_QUALIFIER -void gpu_launch_cpyback(gmx_nbnxm_gpu_t gmx_unused* nb, +void gpu_launch_cpyback(NbnxmGpu gmx_unused* nb, nbnxn_atomdata_t gmx_unused* nbatom, const gmx::StepWorkload gmx_unused& stepWork, gmx::AtomLocality gmx_unused aloc) GPU_FUNC_TERM; @@ -185,7 +185,7 @@ void gpu_launch_cpyback(gmx_nbnxm_gpu_t gmx_unused* nb, * \returns True if the nonbonded tasks associated with \p aloc locality have completed */ GPU_FUNC_QUALIFIER -bool gpu_try_finish_task(gmx_nbnxm_gpu_t gmx_unused* nb, +bool gpu_try_finish_task(NbnxmGpu gmx_unused* nb, const gmx::StepWorkload gmx_unused& stepWork, gmx::AtomLocality gmx_unused aloc, real gmx_unused* e_lj, @@ -209,7 +209,7 @@ bool gpu_try_finish_task(gmx_nbnxm_gpu_t gmx_unused* nb, * \param[out] shiftForces Shift forces buffer to accumulate into * \param[out] wcycle Pointer to wallcycle data structure */ GPU_FUNC_QUALIFIER -float gpu_wait_finish_task(gmx_nbnxm_gpu_t gmx_unused* nb, +float gpu_wait_finish_task(NbnxmGpu gmx_unused* nb, const gmx::StepWorkload gmx_unused& stepWork, gmx::AtomLocality gmx_unused aloc, real gmx_unused* e_lj, @@ -226,7 +226,7 @@ int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unused& ic) * Called on the NS step and performs (re-)allocations and memory copies. !*/ CUDA_FUNC_QUALIFIER void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet, - gmx_nbnxm_gpu_t gmx_unused* gpu_nbv) CUDA_FUNC_TERM; + NbnxmGpu gmx_unused* gpu_nbv) CUDA_FUNC_TERM; /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format. * @@ -242,7 +242,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet, CUDA_FUNC_QUALIFIER void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, bool gmx_unused setFillerCoords, - gmx_nbnxm_gpu_t gmx_unused* gpu_nbv, + NbnxmGpu gmx_unused* gpu_nbv, DeviceBuffer gmx_unused d_x, GpuEventSynchronizer gmx_unused* xReadyOnDevice, gmx::AtomLocality gmx_unused locality, @@ -254,7 +254,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, * \param[in] interactionLocality Local or NonLocal sync point */ CUDA_FUNC_QUALIFIER -void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t gmx_unused* nb, +void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM; /*! \brief Set up internal flags that indicate what type of short-range work there is. @@ -270,7 +270,7 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t gmx_unused* nb, * \param[in] iLocality Interaction locality identifier */ GPU_FUNC_QUALIFIER -void setupGpuShortRangeWork(gmx_nbnxm_gpu_t gmx_unused* nb, +void setupGpuShortRangeWork(NbnxmGpu gmx_unused* nb, const gmx::GpuBonded gmx_unused* gpuBonded, gmx::InteractionLocality gmx_unused iLocality) GPU_FUNC_TERM; @@ -284,13 +284,13 @@ void setupGpuShortRangeWork(gmx_nbnxm_gpu_t gmx_unused* nb, * \param[in] aLocality Atom locality identifier */ GPU_FUNC_QUALIFIER -bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality) +bool haveGpuShortRangeWork(const NbnxmGpu gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false); /*! 
\brief Initialization for F buffer operations on GPU */ CUDA_FUNC_QUALIFIER void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell, - gmx_nbnxm_gpu_t gmx_unused* gpu_nbv, + NbnxmGpu gmx_unused* gpu_nbv, int gmx_unused natoms_total, GpuEventSynchronizer gmx_unused* localReductionDone) CUDA_FUNC_TERM; @@ -313,7 +313,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell, CUDA_FUNC_QUALIFIER void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, DeviceBuffer gmx_unused totalForcesDevice, - gmx_nbnxm_gpu_t gmx_unused* gpu_nbv, + NbnxmGpu gmx_unused* gpu_nbv, void gmx_unused* pmeForcesDevice, gmx::ArrayRef gmx_unused dependencyList, int gmx_unused atomStart, @@ -325,7 +325,7 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, * \param[in] nb The nonbonded data GPU structure */ CUDA_FUNC_QUALIFIER -void nbnxn_wait_x_on_device(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM; +void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; } // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index 745414b67d..35aea4ae3d 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -320,7 +320,7 @@ namespace Nbnxm { /*! \brief Gets and returns the minimum i-list count for balacing based on the GPU used or env.var. when set */ -static int getMinimumIlistCountForGpuBalancing(gmx_nbnxm_gpu_t* nbnxmGpu) +static int getMinimumIlistCountForGpuBalancing(NbnxmGpu* nbnxmGpu) { int minimumIlistCount; @@ -440,8 +440,8 @@ std::unique_ptr init_nb_verlet(const gmx::MDLogger& mdlo fr->nbfp, mimimumNumEnergyGroupNonbonded, (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded)); - gmx_nbnxm_gpu_t* gpu_nbv = nullptr; - int minimumIlistCountForGpuBalancing = 0; + NbnxmGpu* gpu_nbv = nullptr; + int minimumIlistCountForGpuBalancing = 0; if (useGpu) { /* init the NxN GPU data; the last argument tells whether we'll have @@ -469,7 +469,7 @@ nonbonded_verlet_t::nonbonded_verlet_t(std::unique_ptr pairlis std::unique_ptr pairSearch, std::unique_ptr nbat_in, const Nbnxm::KernelSetup& kernelSetup, - gmx_nbnxm_gpu_t* gpu_nbv_ptr, + NbnxmGpu* gpu_nbv_ptr, gmx_wallcycle* wcycle) : pairlistSets_(std::move(pairlistSets)), pairSearch_(std::move(pairSearch)), diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index c795937a84..f4c291ce7c 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -341,7 +341,7 @@ static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool fir * OpenCL kernel objects are cached in nb. If the requested kernel is not * found in the cache, it will be created and the cache will be updated. */ -static inline cl_kernel select_nbnxn_kernel(gmx_nbnxm_gpu_t* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune) +static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune) { const char* kernel_name_to_run; cl_kernel* kernel_ptr; @@ -471,7 +471,7 @@ static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event) } /*! \brief Launch asynchronously the xq buffer host to device copy. 
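Note how setup code such as init_nb_verlet() above only ever holds an NbnxmGpu*: the struct stays an incomplete type in shared code, and each backend (CUDA, OpenCL) defines its own layout in a private types header. A minimal sketch of this opaque-handle pattern, with hypothetical file and member names:

```cpp
// common.h — shared code sees only an incomplete type and uses pointers.
struct NbnxmGpu; // forward declaration; no layout exposed here
NbnxmGpu* gpu_init();
void      gpu_free(NbnxmGpu* nb);

// backend.cpp — one backend's private definition and implementation
// (hypothetical file; in the patch these live in nbnxm_cuda_types.h /
// nbnxm_cuda_data_mgmt.cu and their OpenCL counterparts).
struct NbnxmGpu
{
    bool bUseTwoStreams = false; // members are free to differ per backend
};

NbnxmGpu* gpu_init() { return new NbnxmGpu; } // runs the member initializers
void gpu_free(NbnxmGpu* nb) { delete nb; }
```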
*/ -void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) +void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -575,7 +575,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, con misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream. */ -void gpu_launch_kernel(gmx_nbnxm_gpu_t* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) +void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { cl_atomdata_t* adat = nb->atdat; cl_nbparam_t* nbp = nb->nbparam; @@ -713,7 +713,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) * Launch the pairlist prune only kernel for the given locality. * \p numParts tells in how many parts, i.e. calls the list will be pruned. */ -void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc, const int numParts) +void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { cl_atomdata_t* adat = nb->atdat; cl_nbparam_t* nbp = nb->nbparam; @@ -839,7 +839,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality * Launch asynchronously the download of nonbonded forces from the GPU * (and energies/shift forces if required). */ -void gpu_launch_cpyback(gmx_nbnxm_gpu_t* nb, +void gpu_launch_cpyback(NbnxmGpu* nb, struct nbnxn_atomdata_t* nbatom, const gmx::StepWorkload& stepWork, const AtomLocality aloc) diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 4943a8e0dd..f8822ae31d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -170,7 +170,7 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, gmx_device_runtim /* An element of the fshift device buffer has the same size as one element of the host side fshift buffer. */ - ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift); + ad->fshift_elem_size = sizeof(*nb_staging_t::fshift); ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, SHIFTS * ad->fshift_elem_size, nullptr, &cl_error); @@ -407,8 +407,8 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti { return; } - gmx_nbnxm_gpu_t* nb = nbv->gpu_nbv; - cl_nbparam_t* nbp = nb->nbparam; + NbnxmGpu* nb = nbv->gpu_nbv; + cl_nbparam_t* nbp = nb->nbparam; set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params()); @@ -527,7 +527,7 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData, } /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */ -static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxm_gpu_t* nb, const char* kernel_name) +static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name) { cl_kernel kernel; cl_int cl_error; @@ -544,7 +544,7 @@ static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxm_gpu_t* nb, const char* kernel /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU. 
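One detail of the init_atomdata_first() hunk above deserves a note: `sizeof(*nb_staging_t::fshift)` names a non-static data member through its class, which C++11 permits inside an unevaluated operand such as sizeof. A small stand-alone demonstration:

```cpp
#include <cstddef>
#include <cstdio>

struct nb_staging_t
{
    float (*fshift)[3] = nullptr; // pointer to float[3] shift-force triplets
};

int main()
{
    // sizeof is an unevaluated operand, so the non-static data member may be
    // named via its class (C++11): *fshift has type float[3].
    constexpr std::size_t elemSize = sizeof(*nb_staging_t::fshift);
    std::printf("%zu\n", elemSize); // 3 * sizeof(float), i.e. 12 typically
}
```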
*/ -static void nbnxn_ocl_clear_e_fshift(gmx_nbnxm_gpu_t* nb) +static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) { cl_int cl_error; @@ -577,7 +577,7 @@ static void nbnxn_ocl_clear_e_fshift(gmx_nbnxm_gpu_t* nb) } /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */ -static void nbnxn_gpu_init_kernels(gmx_nbnxm_gpu_t* nb) +static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) { /* Init to 0 main kernel arrays */ /* They will be later on initialized in select_nbnxn_kernel */ @@ -606,7 +606,7 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxm_gpu_t* nb) * Initializes members of the atomdata and nbparam structs and * clears e/fshift output buffers. */ -static void nbnxn_ocl_init_const(gmx_nbnxm_gpu_t* nb, +static void nbnxn_ocl_init_const(NbnxmGpu* nb, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams) @@ -617,20 +617,19 @@ static void nbnxn_ocl_init_const(gmx_nbnxm_gpu_t* nb, //! This function is documented in the header file -gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t* deviceInfo, - const interaction_const_t* ic, - const PairlistParams& listParams, - const nbnxn_atomdata_t* nbat, - const int rank, - const gmx_bool bLocalAndNonlocal) +NbnxmGpu* gpu_init(const gmx_device_info_t* deviceInfo, + const interaction_const_t* ic, + const PairlistParams& listParams, + const nbnxn_atomdata_t* nbat, + const int rank, + const gmx_bool bLocalAndNonlocal) { - gmx_nbnxm_gpu_t* nb; cl_int cl_error; cl_command_queue_properties queue_properties; assert(ic); - snew(nb, 1); + auto nb = new NbnxmGpu; snew(nb->atdat, 1); snew(nb->nbparam, 1); snew(nb->plist[InteractionLocality::Local], 1); @@ -728,7 +727,7 @@ gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t* deviceInfo, /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array. */ -static void nbnxn_ocl_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear) +static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) { if (natoms_clear == 0) { @@ -748,7 +747,7 @@ static void nbnxn_ocl_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear) } //! This function is documented in the header file -void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial) +void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { nbnxn_ocl_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -765,7 +764,7 @@ void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial) } //! This function is documented in the header file -void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) +void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) { char sbuf[STRLEN]; // Timing accumulation should happen only if there was work to do @@ -826,7 +825,7 @@ void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, con } //! This function is documented in the header file -void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom) +void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cl_atomdata_t* adat = nb->atdat; cl_command_queue ls = nb->stream[InteractionLocality::Local]; @@ -841,7 +840,7 @@ void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom) } //! 
This function is documented in the header file -void gpu_init_atomdata(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbat) +void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { cl_int cl_error; int nalloc, natoms; @@ -1001,7 +1000,7 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t* runData) } //! This function is documented in the header file -void gpu_free(gmx_nbnxm_gpu_t* nb) +void gpu_free(NbnxmGpu* nb) { if (nb == nullptr) { @@ -1093,7 +1092,7 @@ void gpu_free(gmx_nbnxm_gpu_t* nb) /* Free timers and timings */ delete nb->timers; sfree(nb->timings); - sfree(nb); + delete nb; if (debug) { @@ -1102,7 +1101,7 @@ void gpu_free(gmx_nbnxm_gpu_t* nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t* nb) +gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb) { return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } @@ -1117,13 +1116,13 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv) } //! This function is documented in the header file -int gpu_min_ci_balanced(gmx_nbnxm_gpu_t* nb) +int gpu_min_ci_balanced(NbnxmGpu* nb) { return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0; } //! This function is documented in the header file -gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t* nb) +gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb) { return ((nb->nbparam->eeltype == eelOclEWALD_ANA) || (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN)); } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp index 3ea5cc186d..dba6415149 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp @@ -167,7 +167,7 @@ static std::string makeDefinesForKernelTypes(bool bFastGen, int eeltype, int vdw * * Does not throw */ -void nbnxn_gpu_compile_kernels(gmx_nbnxm_gpu_t* nb) +void nbnxn_gpu_compile_kernels(NbnxmGpu* nb) { gmx_bool bFastGen = TRUE; cl_program program = nullptr; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index c6f5636658..a9379eea8f 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -60,6 +60,8 @@ #include "nbnxm_ocl_consts.h" +struct gmx_wallclock_gpu_nbnxn_t; + /* kernel does #include "gromacs/math/utilities.h" */ /* Move the actual useful stuff here: */ @@ -151,15 +153,15 @@ enum ePruneKind * The energies/shift forces get downloaded here first, before getting added * to the CPU-side aggregate values. */ -typedef struct cl_nb_staging +struct nb_staging_t { //! LJ energy - float* e_lj; + float* e_lj = nullptr; //! electrostatic energy - float* e_el; + float* e_el = nullptr; //! float3 buffer with shift forces - float (*fshift)[3]; -} cl_nb_staging_t; + float (*fshift)[3] = nullptr; +}; /*! \internal * \brief Nonbonded atom data - both inputs and outputs. @@ -204,7 +206,7 @@ typedef struct cl_atomdata size_t shift_vec_elem_size; //! true if the shift vector has been uploaded - cl_bool bShiftVecUploaded; + bool bShiftVecUploaded; } cl_atomdata_t; /*! \internal @@ -333,48 +335,48 @@ typedef struct Nbnxm::gpu_timers_t cl_timers_t; /*! \internal * \brief Main data structure for OpenCL nonbonded force calculations. */ -struct gmx_nbnxm_gpu_t +struct NbnxmGpu { //! OpenCL device information - const gmx_device_info_t* dev_info; + const gmx_device_info_t* dev_info = nullptr; //! 
OpenCL runtime data (context, kernels) - struct gmx_device_runtime_data_t* dev_rundata; + struct gmx_device_runtime_data_t* dev_rundata = nullptr; /**< Pointers to non-bonded kernel functions * organized similar with nb_kfunc_xxx arrays in nbnxn_ocl.cpp */ ///@{ - cl_kernel kernel_noener_noprune_ptr[eelOclNR][evdwOclNR]; - cl_kernel kernel_ener_noprune_ptr[eelOclNR][evdwOclNR]; - cl_kernel kernel_noener_prune_ptr[eelOclNR][evdwOclNR]; - cl_kernel kernel_ener_prune_ptr[eelOclNR][evdwOclNR]; + cl_kernel kernel_noener_noprune_ptr[eelOclNR][evdwOclNR] = { { nullptr } }; + cl_kernel kernel_ener_noprune_ptr[eelOclNR][evdwOclNR] = { { nullptr } }; + cl_kernel kernel_noener_prune_ptr[eelOclNR][evdwOclNR] = { { nullptr } }; + cl_kernel kernel_ener_prune_ptr[eelOclNR][evdwOclNR] = { { nullptr } }; ///@} //! prune kernels, ePruneKind defined the kernel kinds - cl_kernel kernel_pruneonly[ePruneNR]; + cl_kernel kernel_pruneonly[ePruneNR] = { nullptr }; //! true if prefetching fg i-atom LJ parameters should be used in the kernels - bool bPrefetchLjParam; + bool bPrefetchLjParam = false; /**< auxiliary kernels implementing memset-like functions */ ///@{ - cl_kernel kernel_memset_f; - cl_kernel kernel_memset_f2; - cl_kernel kernel_memset_f3; - cl_kernel kernel_zero_e_fshift; + cl_kernel kernel_memset_f = nullptr; + cl_kernel kernel_memset_f2 = nullptr; + cl_kernel kernel_memset_f3 = nullptr; + cl_kernel kernel_zero_e_fshift = nullptr; ///@} //! true if doing both local/non-local NB work on GPU - cl_bool bUseTwoStreams; + bool bUseTwoStreams = false; //! true indicates that the nonlocal_done event was enqueued - cl_bool bNonLocalStreamActive; + bool bNonLocalStreamActive = false; //! atom data - cl_atomdata_t* atdat; + cl_atomdata_t* atdat = nullptr; //! parameters required for the non-bonded calc. - cl_nbparam_t* nbparam; + cl_nbparam_t* nbparam = nullptr; //! pair-list data structures (local and non-local) - gmx::EnumerationArray plist; + gmx::EnumerationArray plist = { nullptr }; //! staging area where fshift/energies get downloaded - cl_nb_staging_t nbst; + nb_staging_t nbst; //! local and non-local GPU queues gmx::EnumerationArray stream; @@ -383,13 +385,13 @@ struct gmx_nbnxm_gpu_t /*! \{ */ /*! \brief Event triggered when the non-local non-bonded * kernel is done (and the local transfer can proceed) */ - cl_event nonlocal_done; + cl_event nonlocal_done = nullptr; /*! \brief Event triggered when the tasks issued in the local * stream that need to precede the non-local force or buffer * operation calculations are done (e.g. f buffer 0-ing, local * x/q H2D, buffer op initialization in local stream that is * required also by nonlocal stream ) */ - cl_event misc_ops_and_local_H2D_done; + cl_event misc_ops_and_local_H2D_done = nullptr; /*! \} */ //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled @@ -399,11 +401,11 @@ struct gmx_nbnxm_gpu_t //! True if event-based timing is enabled. - cl_bool bDoTime; + bool bDoTime = false; //! OpenCL event-based timers. - cl_timers_t* timers; + cl_timers_t* timers = nullptr; //! Timing data. TODO: deprecate this and query timers for accumulated data instead - struct gmx_wallclock_gpu_nbnxn_t* timings; + gmx_wallclock_gpu_nbnxn_t* timings = nullptr; }; #endif /* NBNXN_OPENCL_TYPES_H */ -- 2.22.0
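Finally, the `= { { nullptr } }` defaults on the two-dimensional kernel-cache arrays above rely on aggregate initialization: elements beyond those listed are value-initialized, so every cached kernel slot starts out null. A small self-contained check (plain pointers standing in for cl_kernel):

```cpp
#include <cassert>

using kernel_t = void*; // stand-in for cl_kernel in this sketch

struct KernelCache
{
    // Only the first element is written explicitly; aggregate initialization
    // value-initializes (nulls) every remaining slot in both dimensions.
    kernel_t table[3][4] = { { nullptr } };
};

int main()
{
    KernelCache cache; // the defaulted constructor applies the initializer
    for (auto& row : cache.table)
    {
        for (kernel_t k : row)
        {
            assert(k == nullptr);
        }
    }
}
```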