From 2dae5db6f3bf0a64ab68b4fcf4ee6facccdbb43b Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Wed, 12 Sep 2018 02:34:10 -0700 Subject: [PATCH] Fix clang-tidy warnings for OCL Change-Id: I2f794a2209f355c0b3ee67623360724d96982458 --- src/gromacs/CMakeLists.txt | 2 +- src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp | 6 +-- src/gromacs/ewald/pme-gpu-internal.cpp | 27 ++++++------- src/gromacs/ewald/pme-gpu-internal.h | 18 ++++----- .../ewald/pme-gpu-program-impl-ocl.cpp | 2 +- src/gromacs/ewald/pme-gpu.cpp | 6 +-- src/gromacs/gpu_utils/devicebuffer_ocl.h | 2 +- src/gromacs/gpu_utils/gpu_utils.h | 2 +- src/gromacs/gpu_utils/gpu_utils_ocl.cpp | 4 +- src/gromacs/gpu_utils/gpuregiontimer_ocl.h | 4 +- src/gromacs/gpu_utils/ocl_caching.cpp | 4 +- src/gromacs/gpu_utils/ocl_caching.h | 6 +-- src/gromacs/gpu_utils/ocl_compiler.cpp | 10 ++--- src/gromacs/gpu_utils/ocl_compiler.h | 4 +- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 2 +- src/gromacs/mdlib/nbnxn_gpu_common.h | 24 +++++++----- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 38 +++++++++---------- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 35 ++++++++--------- 18 files changed, 97 insertions(+), 99 deletions(-) diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index c2d2bb23b0..7f9093399b 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -223,7 +223,7 @@ if (GMX_USE_OPENCL) target_sources(libgromacs PRIVATE $ ) - target_include_directories(libgromacs PRIVATE ${_clFFT_dir}/include) + target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include) # Use the magic variable for how to link any library needed for # dlopen, etc. which is -ldl where needed, and empty otherwise # (e.g. Windows, BSD, Mac). diff --git a/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp b/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp index dcc29cc69e..729a32a07d 100644 --- a/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp +++ b/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp @@ -69,7 +69,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu *pmeGpu) std::array realGridSize, realGridSizePadded, complexGridSizePadded; GMX_RELEASE_ASSERT(!pme_gpu_uses_dd(pmeGpu), "FFT decomposition not implemented"); - PmeGpuKernelParamsBase *kernelParamsPtr = (PmeGpuKernelParamsBase *)pmeGpu->kernelParams.get(); + PmeGpuKernelParamsBase *kernelParamsPtr = pmeGpu->kernelParams.get(); for (int i = 0; i < DIM; i++) { realGridSize[i] = kernelParamsPtr->grid.realGridSize[i]; @@ -152,18 +152,14 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir, inputGrids = &realGrid_; outputGrids = &complexGrid_; break; - - break; case GMX_FFT_COMPLEX_TO_REAL: plan = planC2R_; direction = CLFFT_BACKWARD; inputGrids = &complexGrid_; outputGrids = &realGrid_; break; - default: GMX_THROW(gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); - break; } handleClfftError(clfftEnqueueTransform(plan, direction, commandStreams_.size(), commandStreams_.data(), diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index d919b84a66..a030b45a30 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -54,6 +54,7 @@ #include #include +#include "gromacs/compat/make_unique.h" #include "gromacs/ewald/ewald-utils.h" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/invertmatrix.h" @@ -117,11 +118,11 @@ void pme_gpu_synchronize(const PmeGpu *pmeGpu) gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream); } -void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu) +void pme_gpu_alloc_energy_virial(PmeGpu *pmeGpu) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context); - pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize); } void pme_gpu_free_energy_virial(PmeGpu *pmeGpu) @@ -137,14 +138,14 @@ void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu) c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream); } -void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu) +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu *pmeGpu) { const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX], pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] }; - memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset)); + memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset)); const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] + @@ -156,7 +157,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu) { /* Reallocate the host buffer */ pfree(pmeGpu->staging.h_splineModuli); - pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float)); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_splineModuli), newSplineValuesSize * sizeof(float)); } for (int i = 0; i < DIM; i++) { @@ -271,7 +272,7 @@ void pme_gpu_free_coefficients(const PmeGpu *pmeGpu) freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients); } -void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu) +void pme_gpu_realloc_spline_data(PmeGpu *pmeGpu) { const int order = pmeGpu->common->pme_order; const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); @@ -290,9 +291,9 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu) if (shouldRealloc) { pfree(pmeGpu->staging.h_theta); - pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float)); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float)); pfree(pmeGpu->staging.h_dtheta); - pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float)); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float)); } } @@ -305,14 +306,14 @@ void pme_gpu_free_spline_data(const PmeGpu *pmeGpu) pfree(pmeGpu->staging.h_dtheta); } -void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu) +void pme_gpu_realloc_grid_indices(PmeGpu *pmeGpu) { const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc; GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context); pfree(pmeGpu->staging.h_gridlineIndices); - pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int)); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int)); } void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu) @@ -580,7 +581,7 @@ void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu) pmeGpu->archSpecific->fftSetup.resize(0); for (int i = 0; i < pmeGpu->common->ngrids; i++) { - pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr(new GpuParallel3dFft(pmeGpu))); + pmeGpu->archSpecific->fftSetup.push_back(gmx::compat::make_unique(pmeGpu)); } } } @@ -680,7 +681,7 @@ static void pme_gpu_reinit_grids(PmeGpu *pmeGpu) for (int i = 0; i < DIM; i++) { kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i]; - kernelParamsPtr->grid.realGridSizeFP[i] = (float)kernelParamsPtr->grid.realGridSize[i]; + kernelParamsPtr->grid.realGridSizeFP[i] = static_cast(kernelParamsPtr->grid.realGridSize[i]); kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i]; // The complex grid currently uses no padding; @@ -761,7 +762,7 @@ static void pme_gpu_init(gmx_pme_t *pme, pme->gpu = new PmeGpu(); PmeGpu *pmeGpu = pme->gpu; changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy()); - pmeGpu->common = std::shared_ptr(new PmeShared()); + pmeGpu->common = std::make_shared(); /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */ /* A convenience variable. */ diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h index cb6371f2d3..e48e18a7f5 100644 --- a/src/gromacs/ewald/pme-gpu-internal.h +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -105,9 +105,9 @@ GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *GPU_FUNC_ARGUMENT(pmeG /*! \libinternal \brief * Allocates the fixed size energy and virial buffer both on GPU and CPU. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in,out] pmeGpu The PME GPU structure. */ -void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu); +void pme_gpu_alloc_energy_virial(PmeGpu *pmeGpu); /*! \libinternal \brief * Frees the energy and virial memory both on GPU and CPU. @@ -127,9 +127,9 @@ void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu); /*! \libinternal \brief * Reallocates and copies the pre-computed B-spline values to the GPU. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in,out] pmeGpu The PME GPU structure. */ -void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu); +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu *pmeGpu); /*! \libinternal \brief * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). @@ -226,9 +226,9 @@ void pme_gpu_free_coefficients(const PmeGpu *pmeGpu); /*! \libinternal \brief * Reallocates the buffers on the GPU and the host for the atoms spline data. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in,out] pmeGpu The PME GPU structure. */ -void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu); +void pme_gpu_realloc_spline_data(PmeGpu *pmeGpu); /*! \libinternal \brief * Frees the buffers on the GPU for the atoms spline data. @@ -240,9 +240,9 @@ void pme_gpu_free_spline_data(const PmeGpu *pmeGpu); /*! \libinternal \brief * Reallocates the buffers on the GPU and the host for the particle gridline indices. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in,out] pmeGpu The PME GPU structure. */ -void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu); +void pme_gpu_realloc_grid_indices(PmeGpu *pmeGpu); /*! \libinternal \brief * Frees the buffer on the GPU for the particle gridline indices. @@ -654,7 +654,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_F * Should be called before the pme_gpu_set_io_ranges. */ GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu), - const int GPU_FUNC_ARGUMENT(nAtoms), + int GPU_FUNC_ARGUMENT(nAtoms), const real *GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM /*! \brief \libinternal diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp index 1c138d2894..13f7decf54 100644 --- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp +++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp @@ -58,7 +58,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo) cl_device_id deviceId = deviceInfo->ocl_gpu_id.ocl_device_id; cl_context_properties contextProperties[3]; contextProperties[0] = CL_CONTEXT_PLATFORM; - contextProperties[1] = (cl_context_properties) platformId; + contextProperties[1] = reinterpret_cast(platformId); contextProperties[2] = 0; /* Terminates the list of properties */ cl_int clError; diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp index 29c357d3ca..6f8caf3249 100644 --- a/src/gromacs/ewald/pme-gpu.cpp +++ b/src/gromacs/ewald/pme-gpu.cpp @@ -190,8 +190,8 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, gmx_wallcycle *wcycle) { PmeGpu *pmeGpu = pme->gpu; - const bool computeEnergyAndVirial = pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR; - const bool performBackFFT = pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT); + const bool computeEnergyAndVirial = (pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR) != 0; + const bool performBackFFT = (pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT)) != 0; const unsigned int gridIndex = 0; t_complex *cfftgrid = pme->cfftgrid[gridIndex]; @@ -274,7 +274,7 @@ static void pme_gpu_get_staged_results(const gmx_pme_t *pme, matrix virial, real *energy) { - const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR; + const bool haveComputedEnergyAndVirial = (pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR) != 0; *forces = pme_gpu_get_forces(pme->gpu); if (haveComputedEnergyAndVirial) diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index 64f969fa9d..52c949fc76 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -61,7 +61,7 @@ class TypedClMemory cl_mem data_; public: //! \brief An assignment operator - the purpose is to make allocation/zeroing work - void operator=(cl_mem data){data_ = data; } + TypedClMemory &operator=(cl_mem data){data_ = data; return *this; } //! \brief Returns underlying cl_mem transparently operator cl_mem() {return data_; } }; diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 6ee8b09a9d..65d8502e39 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -213,7 +213,7 @@ void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s), * \returns size in bytes of gpu_dev_info */ GPU_FUNC_QUALIFIER -size_t sizeof_gpu_dev_info(void) GPU_FUNC_TERM_WITH_RETURN(0) +size_t sizeof_gpu_dev_info() GPU_FUNC_TERM_WITH_RETURN(0) /*! \brief Returns a pointer *ptr to page-locked memory of size nbytes. * diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp index c541eb06e7..ea8451fb2e 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp @@ -191,7 +191,7 @@ void findGpus(gmx_gpu_info_t *gpu_info) req_dev_type = CL_DEVICE_TYPE_CPU; } - while (1) + while (true) { cl_int status = clGetPlatformIDs(0, nullptr, &ocl_platform_count); if (CL_SUCCESS != status) @@ -413,7 +413,7 @@ gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info, } //! This function is documented in the header file -size_t sizeof_gpu_dev_info(void) +size_t sizeof_gpu_dev_info() { return sizeof(gmx_device_info_t); } diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h index 00f28c633f..47209b9bc4 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -83,9 +83,9 @@ class GpuRegionTimerImpl GpuRegionTimerImpl(GpuRegionTimerImpl &&) = delete; /*! \brief Should be called before the region start. */ - inline void openTimingRegion(CommandStream){} + inline void openTimingRegion(CommandStream /*unused*/){} /*! \brief Should be called after the region end. */ - inline void closeTimingRegion(CommandStream){} + inline void closeTimingRegion(CommandStream /*unused*/){} /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */ inline double getLastRangeTime() { diff --git a/src/gromacs/gpu_utils/ocl_caching.cpp b/src/gromacs/gpu_utils/ocl_caching.cpp index 566885e5b1..ef7fa819ea 100644 --- a/src/gromacs/gpu_utils/ocl_caching.cpp +++ b/src/gromacs/gpu_utils/ocl_caching.cpp @@ -179,5 +179,5 @@ writeBinaryToCache(cl_program program, const std::string &filename) fwrite(binary, 1, fileSize, f.get()); } -} // namespace -} // namespace +} // namespace ocl +} // namespace gmx diff --git a/src/gromacs/gpu_utils/ocl_caching.h b/src/gromacs/gpu_utils/ocl_caching.h index 47ee8f6225..cb6c9811e0 100644 --- a/src/gromacs/gpu_utils/ocl_caching.h +++ b/src/gromacs/gpu_utils/ocl_caching.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -99,7 +99,7 @@ makeProgramFromCache(const std::string &filename, void writeBinaryToCache(cl_program program, const std::string &filename); -} // namespace -} // namespace +} // namespace ocl +} // namespace gmx #endif diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp index c52a6ec74b..fa8c4137e6 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.cpp +++ b/src/gromacs/gpu_utils/ocl_compiler.cpp @@ -75,7 +75,7 @@ namespace ocl * * Currently caching is disabled by default unless the env var override * is used until we resolve concurrency issues. */ -static bool useBuildCache = getenv("GMX_OCL_GENCACHE"); // (NULL == getenv("GMX_OCL_NOGENCACHE")); +static bool useBuildCache = getenv("GMX_OCL_GENCACHE") != nullptr; /*! \brief Handles writing the OpenCL JIT compilation log to \c fplog. * @@ -257,7 +257,7 @@ size_t getWarpSize(cl_context context, cl_device_id deviceId) { cl_int cl_error; const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}"; - cl_program program = clCreateProgramWithSource(context, 1, (const char**)&warpSizeKernel, nullptr, &cl_error); + cl_program program = clCreateProgramWithSource(context, 1, &warpSizeKernel, nullptr, &cl_error); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not create OpenCL program to determine warp size, error was " + ocl_get_error_string(cl_error))); @@ -374,7 +374,7 @@ removeExtraSpaces(std::string *str) { GMX_RELEASE_ASSERT(str != nullptr, "A pointer to an actual string must be provided"); std::string::iterator newEnd = - std::unique( str->begin(), str->end(), [ = ](char a, char b){ return isspace(a) && (a == b); } ); + std::unique( str->begin(), str->end(), [ = ](char a, char b){ return isspace(a) != 0 && (a == b); } ); str->erase(newEnd, str->end()); } @@ -551,5 +551,5 @@ compileProgram(FILE *fplog, return program; } -} // namespace -} // namespace +} // namespace ocl +} // namespace gmx diff --git a/src/gromacs/gpu_utils/ocl_compiler.h b/src/gromacs/gpu_utils/ocl_compiler.h index 54f0332fed..94dbe02ccb 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.h +++ b/src/gromacs/gpu_utils/ocl_compiler.h @@ -98,7 +98,7 @@ compileProgram(FILE *fplog, cl_device_id deviceId, ocl_vendor_id_t deviceVendorId); -} // namespace -} // namespace +} // namespace ocl +} // namespace gmx #endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 960649956a..e9d816e1fe 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -610,7 +610,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, return; } - getGpuAtomRange(adat, aloc, adat_begin, adat_len); + getGpuAtomRange(adat, aloc, &adat_begin, &adat_len); /* beginning of timed D2H section */ if (bDoTime) diff --git a/src/gromacs/mdlib/nbnxn_gpu_common.h b/src/gromacs/mdlib/nbnxn_gpu_common.h index 90f4bc6204..7f361182c9 100644 --- a/src/gromacs/mdlib/nbnxn_gpu_common.h +++ b/src/gromacs/mdlib/nbnxn_gpu_common.h @@ -57,6 +57,8 @@ #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/vec.h" +#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_gpu_types.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" @@ -117,8 +119,8 @@ static inline int gpuAtomToInteractionLocality(int atomLocality) template static inline void getGpuAtomRange(const AtomDataT *atomData, int atomLocality, - int &atomRangeBegin, - int &atomRangeLen) + int *atomRangeBegin, + int *atomRangeLen) { assert(atomData); validateGpuAtomLocality(atomLocality); @@ -126,13 +128,13 @@ static inline void getGpuAtomRange(const AtomDataT *atomData, /* calculate the atom data index range based on locality */ if (LOCAL_A(atomLocality)) { - atomRangeBegin = 0; - atomRangeLen = atomData->natoms_local; + *atomRangeBegin = 0; + *atomRangeLen = atomData->natoms_local; } else { - atomRangeBegin = atomData->natoms_local; - atomRangeLen = atomData->natoms - atomData->natoms_local; + *atomRangeBegin = atomData->natoms_local; + *atomRangeLen = atomData->natoms - atomData->natoms_local; } } @@ -298,6 +300,8 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin } } +//TODO: move into shared source file with gmx_compile_cpp_as_cuda +//NOLINTNEXTLINE(misc-definitions-in-headers) bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, int flags, int aloc, @@ -328,10 +332,11 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, gpuStreamSynchronize(nb->stream[iLocality]); } - bool calcEner = flags & GMX_FORCE_ENERGY; - bool calcFshift = flags & GMX_FORCE_VIRIAL; + bool calcEner = (flags & GMX_FORCE_ENERGY) != 0; + bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0; - nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime); + nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, + nb->bDoTime != 0); nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); } @@ -360,6 +365,7 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] fshift Pointer to the shift force buffer to accumulate into */ +//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, int flags, int aloc, diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index c2a055b79c..3ef88eaea1 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -119,7 +119,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html */ device_size_t_size_bits = dinfo->adress_bits; - host_size_t_size_bits = (cl_uint)(sizeof(size_t) * 8); + host_size_t_size_bits = static_cast(sizeof(size_t) * 8); /* If sizeof(host size_t) <= sizeof(device size_t) => global_work_size components will always be valid @@ -131,7 +131,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i { size_t device_limit; - device_limit = (((size_t)1) << device_size_t_size_bits) - 1; + device_limit = (1ull << device_size_t_size_bits) - 1; for (int i = 0; i < work_dim; i++) { @@ -393,9 +393,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = flags & GMX_FORCE_ENERGY; + bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; int bCalcFshift = flags & GMX_FORCE_VIRIAL; - bool bDoTime = nb->bDoTime; + bool bDoTime = (nb->bDoTime) != 0; cl_nbparam_params_t nbparams_params; @@ -508,10 +508,10 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, if (debug) { - fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t" - "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", - (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), - (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, + fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t" + "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n", + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c); } @@ -581,7 +581,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, cl_plist_t *plist = nb->plist[iloc]; cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; - bool bDoTime = nb->bDoTime; + bool bDoTime = nb->bDoTime == CL_TRUE; if (plist->haveFreshList) { @@ -658,11 +658,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, if (debug) { - fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t" - "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n" + fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t" + "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" "\tShMem: %zu\n", - (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), - (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c, config.sharedMemorySize); } @@ -711,10 +711,10 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, cl_atomdata_t *adat = nb->atdat; cl_timers_t *t = nb->timers; - bool bDoTime = nb->bDoTime; + bool bDoTime = nb->bDoTime == CL_TRUE; cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = flags & GMX_FORCE_ENERGY; + bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; int bCalcFshift = flags & GMX_FORCE_VIRIAL; @@ -729,11 +729,11 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, test case, overall simulation performance was higher with the API calls, but this has not been tested on AMD OpenCL, so could be worth considering in future. */ - nb->bNonLocalStreamActive = false; + nb->bNonLocalStreamActive = CL_FALSE; return; } - getGpuAtomRange(adat, aloc, adat_begin, adat_len); + getGpuAtomRange(adat, aloc, &adat_begin, &adat_len); /* beginning of timed D2H section */ if (bDoTime) @@ -764,7 +764,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, { cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); assert(CL_SUCCESS == cl_error); - nb->bNonLocalStreamActive = true; + nb->bNonLocalStreamActive = CL_TRUE; } /* only transfer energies in the local stream */ @@ -819,7 +819,7 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) * */ //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald) - if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald) + if (!bForceTabulatedEwald) { bUseAnalyticalEwald = true; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index f1bddab95d..0b931f6028 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -149,7 +149,7 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtim // TODO: handle errors, check clCreateBuffer flags ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error); assert(cl_error == CL_SUCCESS); - ad->bShiftVecUploaded = false; + ad->bShiftVecUploaded = CL_FALSE; /* An element of the fshift device buffer has the same size as one element of the host side fshift buffer. */ @@ -233,7 +233,6 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic, break; default: gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!"); - break; } break; case eintmodFORCESWITCH: @@ -244,7 +243,6 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic, break; default: gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!"); - break; } } else if (ic->vdwtype == evdwPME) @@ -500,7 +498,7 @@ nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData, device_id = devInfo->ocl_gpu_id.ocl_device_id; context_properties[0] = CL_CONTEXT_PLATFORM; - context_properties[1] = (cl_context_properties) platform_id; + context_properties[1] = reinterpret_cast(platform_id); context_properties[2] = 0; /* Terminates the list of properties */ context = clCreateContext(context_properties, 1, &device_id, nullptr, nullptr, &cl_error); @@ -510,7 +508,6 @@ nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData, rank, devInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str()); - return; } runtimeData->context = context; @@ -640,7 +637,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, snew(nb->plist[eintNonlocal], 1); } - nb->bUseTwoStreams = bLocalAndNonlocal; + nb->bUseTwoStreams = static_cast(bLocalAndNonlocal); nb->timers = new cl_timers_t(); snew(nb->timings, 1); @@ -650,14 +647,14 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, snew(nb->dev_rundata, 1); /* init nbst */ - pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj)); - pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); - pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); + pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); + pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); init_plist(nb->plist[eintLocal]); /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */ - nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr); + nb->bDoTime = static_cast(getenv("GMX_DISABLE_GPU_TIMING") == nullptr); /* Create queues only after bDoTime has been initialized */ if (nb->bDoTime) @@ -679,7 +676,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, rank, nb->dev_info->device_name, cl_error); - return; } if (nb->bUseTwoStreams) @@ -693,13 +689,12 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, rank, nb->dev_info->device_name, cl_error); - return; } } if (nb->bDoTime) { - init_timers(nb->timers, nb->bUseTwoStreams); + init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE); init_timings(nb->timings); } @@ -796,7 +791,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, // Timing accumulation should happen only if there was work to do // because getLastRangeTime() gets skipped with empty lists later // which leads to the counter not being reset. - bool bDoTime = (nb->bDoTime && h_plist->nsci > 0); + bool bDoTime = ((nb->bDoTime == CL_TRUE) && h_plist->nsci > 0); cl_command_queue stream = nb->stream[iloc]; cl_plist_t *d_plist = nb->plist[iloc]; @@ -865,7 +860,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, { ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0, SHIFTS * adat->shift_vec_elem_size, ls, nullptr); - adat->bShiftVecUploaded = true; + adat->bShiftVecUploaded = CL_TRUE; } } @@ -876,7 +871,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, cl_int cl_error; int nalloc, natoms; bool realloced; - bool bDoTime = nb->bDoTime; + bool bDoTime = nb->bDoTime == CL_TRUE; cl_timers_t *timers = nb->timers; cl_atomdata_t *d_atdat = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; @@ -1035,16 +1030,16 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) /* Free kernels */ int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]); - free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count); + free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count); kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]); - free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count); + free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count); kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]); - free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count); + free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count); kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]); - free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count); + free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count); free_kernel(&(nb->kernel_memset_f)); free_kernel(&(nb->kernel_memset_f2)); -- 2.22.0