From 97fe7d2f1f7b101a55c3c7db4b32b4014b1ad0d2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 9 Oct 2018 00:49:16 +0200 Subject: [PATCH] Fix more clang-6 warnings in CUDA code Fixes #2681 Change-Id: Ic1e096328687e53ca814034c4c3eb8db40db46df --- src/gromacs/ewald/pme-gpu-program-impl.cu | 50 ++++++++++++++----- src/gromacs/gpu_utils/cudautils.cu | 14 +++--- src/gromacs/gpu_utils/cudautils.cuh | 18 +++++-- src/gromacs/gpu_utils/gpu_utils.cu | 23 ++++----- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 44 ++++++++-------- .../nbnxn_cuda_kernel_pruneonly.cuh | 13 ++++- 6 files changed, 100 insertions(+), 62 deletions(-) diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cu b/src/gromacs/ewald/pme-gpu-program-impl.cu index ac2c3a359e..db2f1b178d 100644 --- a/src/gromacs/ewald/pme-gpu-program-impl.cu +++ b/src/gromacs/ewald/pme-gpu-program-impl.cu @@ -49,6 +49,12 @@ #include "pme-gpu-internal.h" // for GridOrdering enum #include "pme-gpu-types-host.h" +// PME interpolation order +constexpr int c_pmeOrder = 4; +// These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y +constexpr bool c_wrapX = true; +constexpr bool c_wrapY = true; + //! PME CUDA kernels forward declarations. Kernels are documented in their respective files. template < const int order, @@ -59,12 +65,32 @@ template < > void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); +// Add extern declarations to inform that there will be a definition +// provided in another translation unit. +extern template +void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); + template< GridOrdering gridOrdering, bool computeEnergyAndVirial > void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); +// Add extern declarations to inform that there will be a definition +// provided in another translation unit. +extern template +void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_solve_kernel(const PmeGpuCudaKernelParams); + template < const int order, const bool overwriteForces, @@ -73,6 +99,12 @@ template < > void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); +// Add extern declarations to inform that there will be a definition +// provided in another translation unit. +extern template +void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template +void pme_gather_kernel(const PmeGpuCudaKernelParams); PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *) { @@ -82,19 +114,11 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *) solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock; gatherWorkGroupSize = c_gatherMaxThreadsPerBlock; - // PME interpolation order - constexpr int pmeOrder = 4; - GMX_UNUSED_VALUE(pmeOrder); - // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y - constexpr bool wrapX = true; - constexpr bool wrapY = true; - GMX_UNUSED_VALUE(wrapX); - GMX_UNUSED_VALUE(wrapY); - splineAndSpreadKernel = pme_spline_and_spread_kernel; - splineKernel = pme_spline_and_spread_kernel; - spreadKernel = pme_spline_and_spread_kernel; - gatherKernel = pme_gather_kernel; - gatherReduceWithInputKernel = pme_gather_kernel; + splineAndSpreadKernel = pme_spline_and_spread_kernel; + splineKernel = pme_spline_and_spread_kernel; + spreadKernel = pme_spline_and_spread_kernel; + gatherKernel = pme_gather_kernel; + gatherReduceWithInputKernel = pme_gather_kernel; solveXYZKernel = pme_solve_kernel; solveXYZEnergyKernel = pme_solve_kernel; solveYZXKernel = pme_solve_kernel; diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu index b66be40f09..6731fef53d 100644 --- a/src/gromacs/gpu_utils/cudautils.cu +++ b/src/gromacs/gpu_utils/cudautils.cu @@ -48,11 +48,11 @@ // TODO: template on transferKind to avoid runtime conditionals int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, - GpuApiCallBehavior transferKind, cudaStream_t s = 0) + GpuApiCallBehavior transferKind, cudaStream_t s = nullptr) { cudaError_t stat; - if (h_dest == NULL || d_src == NULL || bytes == 0) + if (h_dest == nullptr || d_src == nullptr || bytes == 0) { return -1; } @@ -85,18 +85,18 @@ int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes) /*! * The copy is launched in stream s or if not specified, in stream 0. */ -int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0) +int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = nullptr) { return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s); } // TODO: template on transferKind to avoid runtime conditionals int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, - GpuApiCallBehavior transferKind, cudaStream_t s = 0) + GpuApiCallBehavior transferKind, cudaStream_t s = nullptr) { cudaError_t stat; - if (d_dest == NULL || h_src == NULL || bytes == 0) + if (d_dest == nullptr || h_src == nullptr || bytes == 0) { return -1; } @@ -129,7 +129,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes) /*! * The copy is launched in stream s or if not specified, in stream 0. */ -int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0) +int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = nullptr) { return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s); } @@ -175,7 +175,7 @@ static void setup1DTexture(cudaTextureObject_t &texObj, memset(&td, 0, sizeof(td)); td.readMode = cudaReadModeElementType; - stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL); + stat = cudaCreateTextureObject(&texObj, &rd, &td, nullptr); CU_RET_ERR(stat, "cudaCreateTextureObject failed"); } diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index 230f968e36..2fb14ba46a 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -145,25 +145,25 @@ struct gmx_device_info_t * * The copy is launched in stream s or if not specified, in stream 0. */ -int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/); +int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/); /*! Launches synchronous host to device memory copy in stream 0. */ int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/); /*! Launches asynchronous host to device memory copy in stream s. */ -int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/); +int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/); /*! Launches synchronous or asynchronous host to device memory copy. * * The copy is launched in stream s or if not specified, in stream 0. */ -int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/); +int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/); /*! Launches synchronous host to device memory copy. */ int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); /*! Launches asynchronous host to device memory copy in stream s. */ -int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/); +int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/); // TODO: the 2 functions below are pretty much a constructor/destructor of a simple // GPU table object. There is also almost self-contained fetchFromParamLookupTable() @@ -188,6 +188,11 @@ void initParamLookupTable(T * &d_ptr, int numElem, const gmx_device_info_t *devInfo); +// Add extern declarations so each translation unit understands that +// there will be a definition provided. +extern template void initParamLookupTable(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *); +extern template void initParamLookupTable(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *); + /*! \brief Destroy parameter lookup table. * * Unbinds texture object, deallocates device memory. @@ -202,6 +207,11 @@ void destroyParamLookupTable(T *d_ptr, cudaTextureObject_t texObj, const gmx_device_info_t *devInfo); +// Add extern declarations so each translation unit understands that +// there will be a definition provided. +extern template void destroyParamLookupTable(int *, cudaTextureObject_t, const gmx_device_info_t *); +extern template void destroyParamLookupTable(float *, cudaTextureObject_t, const gmx_device_info_t *); + /*! \brief Add a triplets stored in a float3 to an rvec variable. * * \param[out] a Rvec to increment diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 110c6fc0b0..04afee54db 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -68,7 +68,7 @@ */ static int cuda_max_device_count = 32; -static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL)); +static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr)); /** Dummy kernel used for sanity checking. */ static __global__ void k_dummy_test(void) @@ -421,19 +421,12 @@ bool canDetectGpus(std::string *errorMessage) void findGpus(gmx_gpu_info_t *gpu_info) { - int i, ndev, checkres; - cudaError_t stat; - cudaDeviceProp prop; - gmx_device_info_t *devs; - assert(gpu_info); gpu_info->n_dev_compatible = 0; - ndev = 0; - devs = NULL; - - stat = cudaGetDeviceCount(&ndev); + int ndev; + cudaError_t stat = cudaGetDeviceCount(&ndev); if (stat != cudaSuccess) { GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps " @@ -443,10 +436,12 @@ void findGpus(gmx_gpu_info_t *gpu_info) // We expect to start device support/sanity checks with a clean runtime error state gmx::ensureNoPendingCudaError(""); + gmx_device_info_t *devs; snew(devs, ndev); - for (i = 0; i < ndev; i++) + for (int i = 0; i < ndev; i++) { - checkres = is_gmx_supported_gpu_id(i, &prop); + cudaDeviceProp prop; + int checkres = is_gmx_supported_gpu_id(i, &prop); devs[i].id = i; devs[i].prop = prop; @@ -540,8 +535,8 @@ void gpu_set_host_malloc_and_free(bool bUseGpuKernels, } else { - *nb_alloc = NULL; - *nb_free = NULL; + *nb_alloc = nullptr; + *nb_free = nullptr; } } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 1c4f06e476..58e6d343e0 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -104,7 +104,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic, cu_nbparam_t *nbp, const gmx_device_info_t *dev_info) { - if (nbp->coulomb_tab != NULL) + if (nbp->coulomb_tab != nullptr) { nbnxn_cuda_free_nbparam_table(nbp, dev_info); } @@ -134,10 +134,10 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes) stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el)); CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el"); - /* initialize to NULL poiters to data that is not allocated here and will + /* initialize to nullptr poiters to data that is not allocated here and will need reallocation in nbnxn_cuda_init_atomdata */ - ad->xq = NULL; - ad->f = NULL; + ad->xq = nullptr; + ad->f = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ ad->natoms = -1; @@ -154,8 +154,8 @@ static int pick_ewald_kernel_type(bool bTwinCut, /* Benchmarking/development environment variables to force the use of analytical or tabulated Ewald kernel. */ - bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL); - bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL); + bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr); + bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr); if (bForceAnalyticalEwald && bForceTabulatedEwald) { @@ -185,7 +185,7 @@ static int pick_ewald_kernel_type(bool bTwinCut, /* Use twin cut-off kernels if requested by bTwinCut or the env. var. forces it (use it for debugging/benchmarking only). */ - if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)) + if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == nullptr)) { kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB; } @@ -263,7 +263,6 @@ static void init_nbparam(cu_nbparam_t *nbp, break; default: gmx_incons("The requested LJ combination rule is not implemented in the CUDA GPU accelerated kernels!"); - break; } break; case eintmodFORCESWITCH: @@ -274,7 +273,6 @@ static void init_nbparam(cu_nbparam_t *nbp, break; default: gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); - break; } } else if (ic->vdwtype == evdwPME) @@ -315,7 +313,7 @@ static void init_nbparam(cu_nbparam_t *nbp, } /* generate table for PME */ - nbp->coulomb_tab = NULL; + nbp->coulomb_tab = nullptr; if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN) { init_ewald_coulomb_force_table(ic, nbp, dev_info); @@ -360,12 +358,12 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, /*! Initializes the pair list data structure. */ static void init_plist(cu_plist_t *pl) { - /* initialize to NULL pointers to data that is not allocated here and will + /* initialize to nullptr pointers to data that is not allocated here and will need reallocation in nbnxn_gpu_init_pairlist */ - pl->sci = NULL; - pl->cj4 = NULL; - pl->imask = NULL; - pl->excl = NULL; + pl->sci = nullptr; + pl->cj4 = nullptr; + pl->imask = nullptr; + pl->excl = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ pl->na_c = -1; @@ -440,7 +438,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, cudaError_t stat; gmx_nbnxn_cuda_t *nb; - if (p_nb == NULL) + if (p_nb == nullptr) { return; } @@ -481,7 +479,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, * case will be a single value. */ int highest_priority; - stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority); + stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal], @@ -500,7 +498,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, * This is the main reason why they are disabled by default. */ // TODO: Consider turning on by default when we can detect nr of streams. - nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != NULL); + nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr); if (nb->bDoTime) { @@ -731,7 +729,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) cu_atomdata_t *atdat; cu_nbparam_t *nbparam; - if (nb == NULL) + if (nb == nullptr) { return; } @@ -804,13 +802,13 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) /* Free nbst */ pfree(nb->nbst.e_lj); - nb->nbst.e_lj = NULL; + nb->nbst.e_lj = nullptr; pfree(nb->nbst.e_el); - nb->nbst.e_el = NULL; + nb->nbst.e_el = nullptr; pfree(nb->nbst.fshift); - nb->nbst.fshift = NULL; + nb->nbst.fshift = nullptr; sfree(atdat); sfree(nbparam); @@ -839,7 +837,7 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb) { - return nb != NULL ? + return nb != nullptr ? gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0; } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh index df6185bcd4..d2d1ebb808 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -110,6 +110,17 @@ __global__ void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat, int part) #ifdef FUNCTION_DECLARATION_ONLY ; /* Only do function declaration, omit the function body. */ + +// Add extern declarations so each translation unit understands that +// there will be a definition provided. +extern template +__global__ void +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, + const cu_plist_t, int, int); +extern template +__global__ void +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, + const cu_plist_t, int, int); #else { -- 2.22.0