From 04c763fb1d0293afd87065abed3b4fd6036bc72a Mon Sep 17 00:00:00 2001 From: Paul Bauer Date: Mon, 24 May 2021 12:58:44 +0000 Subject: [PATCH] Apply clang-tidy-11 fixes to CUDA files --- src/.clang-tidy | 10 ++- src/gromacs/domdec/gpuhaloexchange_impl.cu | 16 +--- .../ewald/pme_coordinate_receiver_gpu_impl.cu | 1 + .../ewald/pme_force_sender_gpu_impl.cu | 1 + src/gromacs/ewald/pme_force_sender_gpu_impl.h | 1 + src/gromacs/ewald/pme_gather.cu | 32 ++++--- src/gromacs/ewald/pme_gpu_3dfft.cu | 6 +- .../ewald/pme_gpu_calculate_splines.cuh | 30 +++---- src/gromacs/ewald/pme_gpu_internal.cpp | 4 +- src/gromacs/ewald/pme_gpu_program_impl.cu | 6 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 3 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.h | 1 - src/gromacs/ewald/pme_solve.cu | 34 ++++---- src/gromacs/gpu_utils/cuda_kernel_utils.cuh | 3 +- src/gromacs/gpu_utils/cudautils.cuh | 24 +++--- src/gromacs/gpu_utils/devicebuffer.cuh | 29 +++++-- src/gromacs/gpu_utils/devicebuffer_sycl.h | 2 +- src/gromacs/gpu_utils/gpu_utils.cu | 9 +- src/gromacs/gpu_utils/gpuregiontimer.cuh | 4 +- src/gromacs/gpu_utils/pmalloc.cu | 2 +- src/gromacs/gpu_utils/tests/hostallocator.cpp | 2 +- .../gpu_utils/tests/typecasts_runner.cpp | 12 +-- .../gpu_utils/tests/typecasts_runner.cu | 13 +-- .../gpu_utils/tests/typecasts_runner.h | 13 +-- src/gromacs/gpu_utils/vectype_ops.cuh | 27 +++--- src/gromacs/hardware/device_management.cu | 8 +- src/gromacs/listed_forces/gpubonded_impl.cu | 2 +- src/gromacs/listed_forces/gpubonded_impl.h | 4 +- src/gromacs/listed_forces/gpubondedkernels.cu | 76 ++++++++-------- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 8 +- src/gromacs/mdlib/gpuforcereduction_impl.h | 12 +-- .../mdlib/gpuforcereduction_impl_internal.cu | 1 - src/gromacs/mdlib/leapfrog_gpu.h | 12 +-- src/gromacs/mdlib/lincs_gpu.cpp | 26 +++--- src/gromacs/mdlib/lincs_gpu.h | 16 ++-- src/gromacs/mdlib/lincs_gpu_internal.cu | 46 +++++----- src/gromacs/mdlib/lincs_gpu_internal.h | 16 ++-- src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp | 4 +- src/gromacs/mdlib/settle_gpu.cpp | 18 ++-- src/gromacs/mdlib/settle_gpu.h | 16 ++-- src/gromacs/mdlib/settle_gpu_internal.cu | 36 ++++---- src/gromacs/mdlib/settle_gpu_internal.h | 24 +++--- .../mdlib/settle_gpu_internal_sycl.cpp | 8 +- .../mdlib/update_constrain_gpu_impl.cpp | 4 +- src/gromacs/mdlib/update_constrain_gpu_impl.h | 2 +- .../mdlib/update_constrain_gpu_internal.cu | 2 +- .../mdlib/update_constrain_gpu_internal.h | 2 +- .../update_constrain_gpu_internal_sycl.cpp | 2 +- .../mdtypes/state_propagator_data_gpu.h | 6 +- .../state_propagator_data_gpu_impl.cpp | 6 +- .../mdtypes/state_propagator_data_gpu_impl.h | 8 +- .../state_propagator_data_gpu_impl_gpu.cpp | 14 +-- src/gromacs/nbnxm/cuda/.clang-tidy | 8 ++ .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 2 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 44 +++++----- .../cuda/nbnxm_cuda_kernel_pruneonly.cuh | 14 +-- .../nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh | 86 ++++++++++--------- .../cuda/nbnxm_gpu_buffer_ops_internal.cu | 2 +- .../nbnxm/nbnxm_gpu_buffer_ops_internal.h | 3 +- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 8 +- src/gromacs/pbcutil/pbc_aiuc.h | 26 +++--- src/gromacs/pbcutil/pbc_aiuc_cuda.cuh | 1 + 62 files changed, 448 insertions(+), 410 deletions(-) create mode 100644 src/gromacs/nbnxm/cuda/.clang-tidy diff --git a/src/.clang-tidy b/src/.clang-tidy index 4cfa2bddc8..252ccd00dd 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -35,6 +35,12 @@ # # -misc-no-recursion # We have way too many functions and methods relying on recursion 
+#
+# -readability-static-accessed-through-instance
+# The GPU code in particular uses this in many places.
+#
+# -performance-type-promotion-in-math-fn
+# Those warnings are in the GPU code and we shouldn't need to worry about them.
 Checks:  clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureAPI.strcpy,
          bugprone-*,misc-*,readability-*,performance-*,mpi-*,
          -readability-inconsistent-declaration-parameter-name,
@@ -60,7 +66,9 @@ Checks:  clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureA
          -bugprone-narrowing-conversions,
          -google-readability-avoid-underscore-in-googletest-name,
          -cppcoreguidelines-init-variables,
-         -misc-no-recursion
+         -misc-no-recursion,
+         -readability-static-accessed-through-instance,
+         -performance-type-promotion-in-math-fn
 HeaderFilterRegex: .*
 CheckOptions:
   - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index f32a95bae7..3a245e21e4 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -97,8 +97,6 @@ __global__ void packSendBufKernel(float3* __restrict__ dataPacked,
             *gm_dataDest = *gm_dataSrc;
         }
     }
-
-    return;
 }
 
 /*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index
@@ -128,8 +126,6 @@ __global__ void unpackRecvBufKernel(float3* __restrict__ data,
             *gm_dataDest = *gm_dataSrc;
         }
     }
-
-    return;
 }
 
 void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
@@ -251,8 +247,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
 
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
     wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
-
-    return;
 }
 
 void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
@@ -265,12 +259,12 @@ void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynch
 
     // Similarly send event to task that will push data to this task.
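The NOLINT(bugprone-sizeof-expression) annotations below silence a check that flags sizeof applied to a pointer-typed expression, because that usually indicates a mistyped sizeof of the pointee. In these halo-exchange calls the pointer value itself is the MPI payload, so the sizeof is intentional. A minimal sketch of the pattern, with illustrative names only (not GROMACS or MPI API):

```cpp
#include <cstring>

struct GpuEvent; // stand-in for GpuEventSynchronizer; only the pointer is used

// The bytes of the pointer itself are the message payload, so
// sizeof(GpuEvent*) is deliberate and not a mistyped sizeof(GpuEvent).
void packEventHandle(GpuEvent* event, char* wireBuffer)
{
    std::memcpy(wireBuffer, &event, sizeof(GpuEvent*)); // NOLINT(bugprone-sizeof-expression)
}
```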
GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent; MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, recvRankX_, 0, &remoteCoordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, sendRankX_, 0, @@ -346,8 +340,6 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_); wallcycle_stop(wcycle_, WallCycleCounter::MoveX); - - return; } // The following method should be called after non-local buffer operations, @@ -517,12 +509,12 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, haloDataTransferLaunched_->markEvent(nonLocalStream_); MPI_Sendrecv(&haloDataTransferLaunched_, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, sendRank, 0, &haloDataTransferRemote, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, recvRank, 0, diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 0fbede7b09..6991fe77be 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -101,6 +101,7 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir #if GMX_MPI // Receive event from PP task + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]); recvCount_++; #else diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index ecec4d6ac6..75409d0296 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -101,6 +101,7 @@ void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank) #if GMX_MPI // TODO Using MPI_Isend would be more efficient, particularly when // sending to multiple PP ranks + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_); #else GMX_UNUSED_VALUE(ppRank); diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 0e0ad8122c..4a73f56376 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -65,6 +65,7 @@ public: * \param[in] ppRanks List of PP ranks */ Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef ppRanks); + // NOLINTNEXTLINE(performance-trivially-destructible) ~Impl(); /*! \brief diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 095c47acf8..eedee8a67e 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -63,7 +63,7 @@ __device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, con case ZZ: return realGridSizeFP[ZZ]; } assert(false); - return 0.0f; + return 0.0F; } /*! \brief Reduce the partial force contributions. 
@@ -89,9 +89,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force const int splineIndex, const int lineIndex, const float* realGridSizeFP, - float& fx, - float& fy, - float& fz) + float& fx, // NOLINT(google-runtime-references) + float& fy, // NOLINT(google-runtime-references) + float& fz) // NOLINT(google-runtime-references) { if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2 { @@ -135,7 +135,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force if (dimIndex < DIM) { const float n = read_grid_size(realGridSizeFP, dimIndex); - *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n; + float* __restrict__ sm_forcesAtomIndexOffset = + reinterpret_cast(&sm_forces[atomIndexLocal]); + sm_forcesAtomIndexOffset[dimIndex] = fx * n; } } else @@ -207,7 +209,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force if (sourceIndex == minStride * atomIndex) { - *((float*)(&sm_forces[atomIndex]) + dimIndex) = + float* __restrict__ sm_forcesAtomIndexOffset = + reinterpret_cast(&sm_forces[atomIndex]); + sm_forcesAtomIndexOffset[dimIndex] = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n; } } @@ -465,9 +469,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); __syncwarp(); } - float fx = 0.0f; - float fy = 0.0f; - float fz = 0.0f; + float fx = 0.0F; + float fy = 0.0F; + float fz = 0.0F; const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); @@ -545,7 +549,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { int outputIndexLocal = i * iterThreads + threadLocalId; int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + float outputForceComponent = (reinterpret_cast(sm_forces)[outputIndexLocal]); gm_forces[outputIndexGlobal] = outputForceComponent; } } @@ -554,9 +558,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { /* We must sync here since the same shared memory is used as above. */ __syncthreads(); - fx = 0.0f; - fy = 0.0f; - fz = 0.0f; + fx = 0.0F; + fy = 0.0F; + fz = 0.0F; const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); if (chargeCheck) { @@ -605,7 +609,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { int outputIndexLocal = i * iterThreads + threadLocalId; int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + float outputForceComponent = (reinterpret_cast(sm_forces)[outputIndexLocal]); gm_forces[outputIndexGlobal] += outputForceComponent; } } diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 1a8e9c577b..80daa42020 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
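The NOLINT(google-runtime-references) markers above keep clang-tidy from demanding pointer out-parameters for the per-thread force accumulators. A hedged sketch of the two forms (names are illustrative, not the GROMACS kernels):

```cpp
// Google style prefers the pointer form for mutable arguments; the PME
// kernels keep references so accumulator updates read without dereferences.
__device__ void accumulateForce(float& fx, float delta) // NOLINT(google-runtime-references)
{
    fx += delta;
}

__device__ void accumulateForceViaPtr(float* fx, float delta) // style-preferred alternative
{
    *fx += delta;
}
```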
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -79,9 +79,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex)
     const int realGridSizePaddedTotal =
             realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
 
-    realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex];
+    realGrid_ = reinterpret_cast<cufftReal*>(kernelParamsPtr->grid.d_realGrid[gridIndex]);
     GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
-    complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex];
+    complexGrid_ = reinterpret_cast<cufftComplex*>(kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
     GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
 
     cufftResult_t result;
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
index 8e8496da8f..6c85fff0ff 100644
--- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -104,10 +104,10 @@ int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIn
  *
  * This is called from the spline_and_spread and gather PME kernels.
  */
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+bool __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
 {
     assert(isfinite(coefficient));
-    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+    return c_skipNeutralAtoms ? (coefficient != 0.0F) : true;
 }
 
 //! Controls if the atom and charge data is prefetched into shared memory or loaded per thread from global
@@ -126,15 +126,15 @@ __device__ inline void assertIsFinite(T arg);
 template<>
 __device__ inline void assertIsFinite(float3 gmx_unused arg)
 {
-    assert(isfinite(float(arg.x)));
-    assert(isfinite(float(arg.y)));
-    assert(isfinite(float(arg.z)));
+    assert(isfinite(static_cast<float>(arg.x)));
+    assert(isfinite(static_cast<float>(arg.y)));
+    assert(isfinite(static_cast<float>(arg.z)));
 }
 
 template<typename T>
 __device__ inline void assertIsFinite(T gmx_unused arg)
 {
-    assert(isfinite(float(arg)));
+    assert(isfinite(static_cast<float>(arg)));
 }
 
 /*!
\brief @@ -268,7 +268,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k const float shift = c_pmeMaxUnitcellShift; /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */ t = (t + shift) * n; - tInt = (int)t; + tInt = static_cast(t); assert(sharedMemoryIndex < atomsPerBlock * DIM); sm_fractCoords[sharedMemoryIndex] = t - tInt; tableIndex += tInt; @@ -302,14 +302,14 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k assert(isfinite(dr)); /* dr is relative offset from lower cell limit */ - splineData[order - 1] = 0.0f; + splineData[order - 1] = 0.0F; splineData[1] = dr; - splineData[0] = 1.0f - dr; + splineData[0] = 1.0F - dr; #pragma unroll for (int k = 3; k < order; k++) { - div = 1.0f / (k - 1.0f); + div = 1.0F / (k - 1.0F); splineData[k - 1] = div * dr * splineData[k - 2]; #pragma unroll for (int l = 1; l < (k - 1); l++) @@ -317,7 +317,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k splineData[k - l - 1] = div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]); } - splineData[0] = div * (1.0f - dr) * splineData[0]; + splineData[0] = div * (1.0F - dr) * splineData[0]; } const int thetaIndexBase = @@ -333,7 +333,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); - const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o]; + const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o]; assert(isfinite(dtheta)); assert(thetaIndex < order * DIM * atomsPerBlock); if (writeSmDtheta) @@ -348,7 +348,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k } } - div = 1.0f / (order - 1.0f); + div = 1.0F / (order - 1.0F); splineData[order - 1] = div * dr * splineData[order - 2]; #pragma unroll for (int k = 1; k < (order - 1); k++) @@ -357,7 +357,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k * ((dr + k) * splineData[order - k - 2] + (order - k - dr) * splineData[order - k - 1]); } - splineData[0] = div * (1.0f - dr) * splineData[0]; + splineData[0] = div * (1.0F - dr) * splineData[0]; /* Storing the spline values (theta) */ #pragma unroll diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index d0e54ea101..328e0c11f8 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -463,9 +463,9 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu) auto* kernelParamsPtr = pmeGpu->kernelParams.get(); #if GMX_GPU_CUDA destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable, - kernelParamsPtr->fractShiftsTableTexture); + &kernelParamsPtr->fractShiftsTableTexture); destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable, - kernelParamsPtr->gridlineIndicesTableTexture); + &kernelParamsPtr->gridlineIndicesTableTexture); #elif GMX_GPU_OPENCL || GMX_GPU_SYCL freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable); freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable); diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu index b7830fa60b..ea0494ec8b 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.cu +++ b/src/gromacs/ewald/pme_gpu_program_impl.cu @@ -60,7 +60,7 @@ constexpr int c_stateB = 1; //! PME CUDA kernels forward declarations. 
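The splineData recursion in calculate_splines above raises the B-spline order one step per pass. A host-side sketch for the common order-4 (cubic) case, assuming the same recursion as the device loop:

```cpp
#include <array>

// Builds the four order-4 B-spline weights from the fractional offset
// dr in [0, 1), mirroring the device-side splineData recursion.
std::array<float, 4> cubicSplineWeights(float dr)
{
    std::array<float, 4> w = { 1.0F - dr, dr, 0.0F, 0.0F };
    for (int k = 3; k <= 4; k++) // raise the order from 2 to 4
    {
        const float div = 1.0F / (k - 1.0F);
        w[k - 1] = div * dr * w[k - 2];
        for (int l = 1; l < (k - 1); l++)
        {
            w[k - l - 1] = div * ((dr + l) * w[k - l - 2] + (k - l - dr) * w[k - l - 1]);
        }
        w[0] = div * (1.0F - dr) * w[0];
    }
    return w; // the four weights sum to 1 for any dr
}
```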
Kernels are documented in their respective files. template -__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_spline_and_spread_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. @@ -99,7 +99,7 @@ extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); template /* It is significantly slower to pass gridIndex as a kernel parameter */ -__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_solve_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. @@ -115,7 +115,7 @@ extern template __global__ void pme_solve_kernel -__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_gather_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 2e242a074f..39068544b4 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -89,7 +89,6 @@ void PmePpCommGpu::Impl::reinit(int size) // Reallocate buffer used for staging PME force on GPU reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_); - return; } void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu) @@ -98,6 +97,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int // Receive event from PME task and add to stream, to ensure pull of data doesn't // occur before PME force calc is completed GpuEventSynchronizer* pmeSync; + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); pmeSync->enqueueWaitEvent(pmePpCommStream_); #endif @@ -166,6 +166,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3* se // Record and send event to allow PME task to sync to above transfer before commencing force calculations pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_); GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_; + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_); #endif } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index d4ee85872e..fbd9b4f8a4 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -152,7 +152,6 @@ private: int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); -private: //! GPU context handle (not used in CUDA) const DeviceContext& deviceContext_; //! Handle for CUDA stream used for the communication operations in this class diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu index 3f5d2d06f4..83e21b1f11 100644 --- a/src/gromacs/ewald/pme_solve.cu +++ b/src/gromacs/ewald/pme_solve.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
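Dropping const from the by-value kernelParams arguments above is behavior-neutral: top-level const on a value parameter is not part of the function type, which is why clang-tidy treats it as noise in declarations. A compile-time check of that language rule (illustrative types, not the PME kernels):

```cpp
#include <type_traits>

struct Params
{
    int n;
};

void kernelDecl(Params p);
void kernelDeclConst(const Params p); // declares the same function type

static_assert(std::is_same_v<decltype(kernelDecl), decltype(kernelDeclConst)>,
              "top-level const on a by-value parameter does not change the signature");
```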
* - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -88,7 +88,7 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex] + kernelParams.grid.splineValuesOffset[minorDim]; float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex]; - float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex]; + float2* __restrict__ gm_grid = reinterpret_cast(kernelParams.grid.d_fourierGrid[gridIndex]); /* Various grid sizes and indices */ const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused @@ -119,13 +119,13 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT const int indexMajor = blockIdx.z; /* Optional outputs */ - float energy = 0.0f; - float virxx = 0.0f; - float virxy = 0.0f; - float virxz = 0.0f; - float viryy = 0.0f; - float viryz = 0.0f; - float virzz = 0.0f; + float energy = 0.0F; + float virxx = 0.0F; + float virxy = 0.0F; + float virxz = 0.0F; + float viryy = 0.0F; + float viryz = 0.0F; + float virzz = 0.0F; assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]); if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor) @@ -176,20 +176,20 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } /* 0.5 correction factor for the first and last components of a Z dimension */ - float corner_fac = 1.0f; + float corner_fac = 1.0F; switch (gridOrdering) { case GridOrdering::YZX: if ((kMiddle == 0) | (kMiddle == maxkMiddle)) { - corner_fac = 0.5f; + corner_fac = 0.5F; } break; case GridOrdering::XYZ: if ((kMinor == 0) | (kMinor == maxkMinor)) { - corner_fac = 0.5f; + corner_fac = 0.5F; } break; @@ -206,13 +206,13 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT + mZ * kernelParams.current.recipBox[ZZ][ZZ]; const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk; - assert(m2k != 0.0f); + assert(m2k != 0.0F); // TODO: use LDG/textures for gm_splineValue float denom = m2k * float(CUDART_PI_F) * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor]; assert(isfinite(denom)); - assert(denom != 0.0f); + assert(denom != 0.0F); const float tmp1 = expf(-kernelParams.grid.ewaldFactor * m2k); const float etermk = kernelParams.constants.elFactor * tmp1 / denom; @@ -226,9 +226,9 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT if (computeEnergyAndVirial) { const float tmp1k = - 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); + 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); - float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f; + float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F; float ets2 = corner_fac * tmp1k; energy = ets2; @@ -303,7 +303,7 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT /* Reduce 7 outputs per warp in the shared memory */ const int stride = 8; // this is c_virialAndEnergyCount==7 rounded up to power of 2 for convenience, hence the assert - 
assert(c_virialAndEnergyCount == 7); + static_assert(c_virialAndEnergyCount == 7); const int reductionBufferSize = (c_solveMaxThreadsPerBlock / warp_size) * stride; __shared__ float sm_virialAndEnergy[reductionBufferSize]; diff --git a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh index 1346c6218e..0333d84ae8 100644 --- a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh +++ b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -67,6 +67,7 @@ template static __forceinline__ __device__ T fetchFromTexture(const cudaTextureObject_t texObj, int index) { assert(index >= 0); + // NOLINTNEXTLINE(misc-static-assert) assert(!c_disableCudaTextures); return tex1Dfetch(texObj, index); } diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index e5d3c3aad6..daf05a019d 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -61,7 +61,7 @@ namespace * * \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess. */ -static inline std::string getDeviceErrorString(const cudaError_t deviceError) +inline std::string getDeviceErrorString(const cudaError_t deviceError) { return formatString("CUDA error #%d (%s): %s.", deviceError, @@ -76,7 +76,7 @@ static inline std::string getDeviceErrorString(const cudaError_t deviceError) * * \throws InternalError if deviceError is not a success. */ -static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage) +inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage) { if (deviceError != cudaSuccess) { @@ -92,7 +92,7 @@ static inline void checkDeviceError(const cudaError_t deviceError, const std::st * * \param[in] errorMessage Undecorated error message. */ -static inline void ensureNoPendingDeviceError(const std::string& errorMessage) +inline void ensureNoPendingDeviceError(const std::string& errorMessage) { // Ensure there is no pending error that would otherwise affect // the behaviour of future error handling. @@ -135,13 +135,13 @@ enum class GpuApiCallBehavior; #ifdef CHECK_CUDA_ERRORS /*! Check for CUDA error on the return status of a CUDA RT API call. 
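The CU_RET_ERR rewrite just below parenthesizes deviceError and msg; bugprone-macro-parentheses guards against operator precedence splitting an argument expression apart. A self-contained sketch of the failure mode (the macros are illustrative, not from GROMACS):

```cpp
#define TWICE_UNSAFE(x) x * 2
#define TWICE_SAFE(x) ((x) * 2)

static_assert(TWICE_UNSAFE(1 + 2) == 5, "expands to 1 + 2 * 2");
static_assert(TWICE_SAFE(1 + 2) == 6, "expands to ((1 + 2) * 2)");
```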
*/ -# define CU_RET_ERR(deviceError, msg) \ - do \ - { \ - if (deviceError != cudaSuccess) \ - { \ - gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \ - } \ +# define CU_RET_ERR(deviceError, msg) \ + do \ + { \ + if ((deviceError) != cudaSuccess) \ + { \ + gmx_fatal(FARGS, "%s\n", ((msg) + gmx::getDeviceErrorString(deviceError)).c_str()); \ + } \ } while (0) #else /* CHECK_CUDA_ERRORS */ @@ -239,6 +239,7 @@ void prepareGpuKernelArgument(KernelPtr kernel, const CurrentArg* argPtr, const RemainingArgs*... otherArgsPtrs) { + // NOLINTNEXTLINE(google-readability-casting) (*kernelArgsPtr)[argIndex] = (void*)argPtr; prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs...); } @@ -283,6 +284,7 @@ void launchGpuKernel(void (*kernel)(Args...), { dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]); dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]); + // NOLINTNEXTLINE(google-readability-casting) cudaLaunchKernel((void*)kernel, gridSize, blockSize, diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh index 2b83752c2a..97e9d525b2 100644 --- a/src/gromacs/gpu_utils/devicebuffer.cuh +++ b/src/gromacs/gpu_utils/devicebuffer.cuh @@ -68,7 +68,8 @@ template void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& /* deviceContext */) { GMX_ASSERT(buffer, "needs a buffer pointer"); - cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType)); + // NOLINTNEXTLINE(google-readability-casting) + cudaError_t stat = cudaMalloc((void**)(buffer), numValues * sizeof(ValueType)); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str()); @@ -130,6 +131,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, { case GpuApiCallBehavior::Async: GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA"); + // NOLINTNEXTLINE(google-readability-casting) stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes, @@ -142,7 +144,11 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, case GpuApiCallBehavior::Sync: stat = cudaMemcpy( - *((ValueType**)buffer) + startingOffset, hostBuffer, bytes, cudaMemcpyHostToDevice); + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + hostBuffer, + bytes, + cudaMemcpyHostToDevice); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str()); @@ -189,6 +195,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Destination host buffer was not pinned for CUDA"); stat = cudaMemcpyAsync(hostBuffer, + // NOLINTNEXTLINE(google-readability-casting) *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost, @@ -199,8 +206,11 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, break; case GpuApiCallBehavior::Sync: - stat = cudaMemcpy( - hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost); + stat = cudaMemcpy(hostBuffer, + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + bytes, + cudaMemcpyDeviceToHost); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Synchronous D2H copy failed. 
" + gmx::getDeviceErrorString(stat)).c_str()); @@ -287,7 +297,11 @@ void clearDeviceBufferAsync(DeviceBuffer* buffer, const char pattern = 0; cudaError_t stat = cudaMemsetAsync( - *((ValueType**)buffer) + startingOffset, pattern, bytes, deviceStream.stream()); + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + pattern, + bytes, + deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, ("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str()); } @@ -345,6 +359,7 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, const size_t sizeInBytes = numValues * sizeof(ValueType); cudaError_t stat = + // NOLINTNEXTLINE(google-readability-casting) cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice); GMX_RELEASE_ASSERT(stat == cudaSuccess, @@ -378,11 +393,11 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, * \param[in,out] deviceTexture Device texture object to unbind. */ template -void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture& deviceTexture) +void destroyParamLookupTable(DeviceBuffer* deviceBuffer, const DeviceTexture* deviceTexture) { if (!c_disableCudaTextures && deviceTexture && deviceBuffer) { - cudaError_t stat = cudaDestroyTextureObject(deviceTexture); + cudaError_t stat = cudaDestroyTextureObject(*deviceTexture); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str()); diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h index 3ae9b615da..743b9ea785 100644 --- a/src/gromacs/gpu_utils/devicebuffer_sycl.h +++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h @@ -535,7 +535,7 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, * \param[in,out] deviceBuffer Device buffer to store data in. */ template -void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture& /* deviceTexture */) +void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture* /* deviceTexture */) { deviceBuffer->buffer_.reset(nullptr); } diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index b35fcabd4a..b72dbaeff6 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -65,6 +65,7 @@ #include "gromacs/utility/snprintf.h" #include "gromacs/utility/stringutil.h" +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr)); bool isHostMemoryPinned(const void* h_ptr) @@ -102,7 +103,7 @@ bool isHostMemoryPinned(const void* h_ptr) return isPinned; } -void startGpuProfiler(void) +void startGpuProfiler() { /* The NVPROF_ID environment variable is set by nvprof and indicates that mdrun is executed in the CUDA profiler. @@ -118,7 +119,7 @@ void startGpuProfiler(void) } } -void stopGpuProfiler(void) +void stopGpuProfiler() { /* Stopping the nvidia here allows us to eliminate the subsequent API calls from the trace, e.g. uninitialization and cleanup. 
*/ @@ -130,7 +131,7 @@ void stopGpuProfiler(void) } } -void resetGpuProfiler(void) +void resetGpuProfiler() { /* With CUDA <=7.5 the profiler can't be properly reset; we can only start * the profiling here (can't stop it) which will achieve the desired effect if @@ -180,7 +181,7 @@ static void peerAccessCheckStat(const cudaError_t stat, { std::string errorString = gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB); - CU_RET_ERR(stat, errorString.c_str()); + CU_RET_ERR(stat, errorString); } if (stat != cudaSuccess) { diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh index c56d60da61..59793095db 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.cuh +++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -109,7 +109,7 @@ public: * for passing into individual GPU API calls. * This is just a dummy in CUDA. */ - inline CommandEvent* fetchNextEvent() { return nullptr; } + static inline CommandEvent* fetchNextEvent() { return nullptr; } }; //! Short-hand for external use diff --git a/src/gromacs/gpu_utils/pmalloc.cu b/src/gromacs/gpu_utils/pmalloc.cu index 3a8f1058fe..cef8b97d73 100644 --- a/src/gromacs/gpu_utils/pmalloc.cu +++ b/src/gromacs/gpu_utils/pmalloc.cu @@ -66,7 +66,7 @@ void pmalloc(void** h_ptr, size_t nbytes) gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory."); stat = cudaMallocHost(h_ptr, nbytes, flag); - sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes); + sprintf(strbuf, "cudaMallocHost of size %d bytes failed", static_cast(nbytes)); CU_RET_ERR(stat, strbuf); } diff --git a/src/gromacs/gpu_utils/tests/hostallocator.cpp b/src/gromacs/gpu_utils/tests/hostallocator.cpp index 2bf3903214..21ed3e85ac 100644 --- a/src/gromacs/gpu_utils/tests/hostallocator.cpp +++ b/src/gromacs/gpu_utils/tests/hostallocator.cpp @@ -331,7 +331,7 @@ TYPED_TEST(HostAllocatorTestCopyable, ManualPinningOperationsWorkWithCuda) EXPECT_TRUE(input.empty()); resizeAndFillInput(&input, 3, 1); // realloc and copy). - auto oldInputData = input.data(); + auto* oldInputData = input.data(); changePinningPolicy(&input, PinningPolicy::CannotBePinned); EXPECT_FALSE(isPinned(input)); // These cannot be equal as both had to be allocated at the same diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cpp b/src/gromacs/gpu_utils/tests/typecasts_runner.cpp index 023f14ae43..0256c463d9 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cpp +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
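The auto* change in the host-allocator test above follows readability-qualified-auto: when the deduced type is a pointer, spelling auto* keeps the pointer-ness visible at the call site. A minimal sketch:

```cpp
#include <type_traits>
#include <vector>

void inspectBuffer(std::vector<float>& input)
{
    auto* data = input.data(); // deduces float*; the '*' makes that explicit
    static_assert(std::is_same_v<decltype(data), float*>);
    (void)data;
}
```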
@@ -46,6 +46,8 @@ #include +#include "gromacs/utility/arrayref.h" + #include "testutils/testasserts.h" #if !GMX_GPU_CUDA @@ -56,14 +58,14 @@ namespace gmx namespace test { -void convertRVecToFloat3OnHost(std::vector& /* rVecOutput */, - const std::vector& /* rVecInput */) +void convertRVecToFloat3OnHost(ArrayRef /* rVecOutput */, + ArrayRef /* rVecInput */) { FAIL() << "Can't test float3 and RVec compatibility without CUDA."; } -void convertRVecToFloat3OnDevice(std::vector& /* rVecOutput */, - const std::vector& /* rVecInput */, +void convertRVecToFloat3OnDevice(ArrayRef /* rVecOutput */, + ArrayRef /* rVecInput */, const TestDevice* /* testDevice */) { FAIL() << "Can't test float3 and RVec compatibility without CUDA."; diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index dde856d93f..4353f9a30b 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/hardware/device_information.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/stringutil.h" @@ -65,7 +66,7 @@ namespace test * \param[in] float3Output Output data in float3 format. * \param[in] numElements Size of the data buffers. */ -void inline saveFloat3InRVecFormat(std::vector& rVecOutput, const float3* float3Output, int numElements) +void inline saveFloat3InRVecFormat(ArrayRef rVecOutput, const float3* float3Output, int numElements) { for (int i = 0; i < numElements; i++) { @@ -75,7 +76,7 @@ void inline saveFloat3InRVecFormat(std::vector& rVecOutput, const flo } } -void convertRVecToFloat3OnHost(std::vector& rVecOutput, const std::vector& rVecInput) +void convertRVecToFloat3OnHost(ArrayRef rVecOutput, ArrayRef rVecInput) { const int numElements = rVecInput.size(); @@ -105,9 +106,9 @@ static __global__ void convertRVecToFloat3OnDevice_kernel(DeviceBuffer g } } -void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, - const std::vector& h_rVecInput, - const TestDevice* testDevice) +void convertRVecToFloat3OnDevice(ArrayRef h_rVecOutput, + ArrayRef h_rVecInput, + const TestDevice* testDevice) { const DeviceContext& deviceContext = testDevice->deviceContext(); const DeviceStream& deviceStream = testDevice->deviceStream(); diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.h b/src/gromacs/gpu_utils/tests/typecasts_runner.h index 44a3247a45..f1c60517e5 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.h +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
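The test-runner signatures above move from std::vector references to gmx::ArrayRef views, so the helpers no longer care how the caller stores the data. A rough analogue using std::span as a stand-in for ArrayRef (the doubling transform is a placeholder for the RVec-to-float3 conversion):

```cpp
#include <cstddef>
#include <span>
#include <vector>

void convert(std::span<float> output, std::span<const float> input)
{
    for (std::size_t i = 0; i < input.size(); i++)
    {
        output[i] = 2.0F * input[i]; // placeholder transform
    }
}

void caller()
{
    std::vector<float> in(3, 1.0F);
    std::vector<float> out(3);
    convert(out, in); // vectors bind to the views implicitly
}
```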
@@ -54,6 +54,9 @@
 
 namespace gmx
 {
 
+template<typename T>
+class ArrayRef;
+
 namespace test
 {
 
@@ -62,7 +65,7 @@ namespace test
  * \param[out] rVecOutput Data in RVec format for the output.
  * \param[in] rVecInput Data in RVec format with the input.
  */
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput);
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput);
 
 /*! \brief Tests the compatibility of RVec and float3 using the conversion on device.
  *
  * \param[in] rVecInput Data in RVec format with the input.
  * \param[in] testDevice Test hardware environment to get DeviceContext and DeviceStream from.
  */
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& rVecOutput,
-                                 const std::vector<gmx::RVec>& rVecInput,
-                                 const TestDevice* testDevice);
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> rVecOutput,
+                                 ArrayRef<const gmx::RVec> rVecInput,
+                                 const TestDevice* testDevice);
 
 } // namespace test
 
diff --git a/src/gromacs/gpu_utils/vectype_ops.cuh b/src/gromacs/gpu_utils/vectype_ops.cuh
index cce3fc9008..e358e0c0e6 100644
--- a/src/gromacs/gpu_utils/vectype_ops.cuh
+++ b/src/gromacs/gpu_utils/vectype_ops.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2015,2016,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,7 +45,7 @@ __forceinline__ __host__ __device__ float3 make_float3(float4 a)
 {
     return make_float3(a.x, a.y, a.z);
 }
-__forceinline__ __host__ __device__ float3 operator-(float3& a)
+__forceinline__ __host__ __device__ float3 operator-(const float3& a)
 {
     return make_float3(-a.x, -a.y, -a.z);
 }
@@ -65,18 +65,21 @@ __forceinline__ __host__ __device__ float3 operator*(float k, float3 a)
 {
     return make_float3(k * a.x, k * a.y, k * a.z);
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float3& a, float3 b)
 {
     a.x += b.x;
     a.y += b.y;
     a.z += b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float3& a, float4 b)
 {
     a.x += b.x;
     a.y += b.y;
     a.z += b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator-=(float3& a, float3 b)
 {
     a.x -= b.x;
@@ -99,12 +102,14 @@ __forceinline__ __host__ __device__ float3 operator*(float3 a, float3 b)
 {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator*=(float3& a, float3 b)
 {
     a.x *= b.x;
     a.y *= b.y;
     a.z *= b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator*=(float3& a, float b)
 {
     a.x *= b;
@@ -126,7 +131,7 @@ __forceinline__ __host__ __device__ float4 make_float4(float s)
 }
 __forceinline__ __host__ __device__ float4 make_float4(float3 a)
 {
-    return make_float4(a.x, a.y, a.z, 0.0f);
+    return make_float4(a.x, a.y, a.z, 0.0F);
 }
 __forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
 {
@@ -151,12 +156,14 @@ __forceinline__ __host__ __device__ void operator+=(float4& a, float4 b)
     a.z += b.z;
     a.w += b.w;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float4& a, float3 b)
 {
     a.x += b.x;
a.y += b.y; a.z += b.z; } +// NOLINTNEXTLINE(google-runtime-references) __forceinline__ __host__ __device__ void operator-=(float4& a, float3 b) { a.x -= b.x; @@ -223,21 +230,21 @@ __forceinline__ __device__ float cos_angle(const float3 a, const float3 b) float ipb = norm2(b); float ip = iprod(a, b); float ipab = ipa * ipb; - if (ipab > 0.0f) + if (ipab > 0.0F) { cosval = ip * rsqrt(ipab); } else { - cosval = 1.0f; + cosval = 1.0F; } - if (cosval > 1.0f) + if (cosval > 1.0F) { - return 1.0f; + return 1.0F; } - if (cosval < -1.0f) + if (cosval < -1.0F) { - return -1.0f; + return -1.0F; } return cosval; @@ -267,8 +274,8 @@ __forceinline__ __device__ float gmx_angle(const float3 a, const float3 b) * * \param[in] a First vector. * \param[in] b Second vector. - * \returns Angle between vectors. */ +// NOLINTNEXTLINE(google-runtime-references) __forceinline__ __device__ void atomicAdd(float3& a, const float3 b) { atomicAdd(&a.x, b.x); diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu index 3df63f8131..c1d669c4e2 100644 --- a/src/gromacs/hardware/device_management.cu +++ b/src/gromacs/hardware/device_management.cu @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -66,10 +66,10 @@ * * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. */ -static int c_cudaMaxDeviceCount = 32; +static const int c_cudaMaxDeviceCount = 32; /** Dummy kernel used for sanity checking. */ -static __global__ void dummy_kernel(void) {} +static __global__ void dummy_kernel() {} static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp) { @@ -338,7 +338,7 @@ void setActiveDevice(const DeviceInformation& deviceInfo) if (stat != cudaSuccess) { auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId); - CU_RET_ERR(stat, message.c_str()); + CU_RET_ERR(stat, message); } if (debug) diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index 48bd3f8360..90c5449da9 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -156,7 +156,7 @@ static void convertIlistToNbnxnOrder(const InteractionList& src, int numAtomsPerInteraction, ArrayRef nbnxnAtomOrder) { - GMX_ASSERT(src.size() == 0 || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order"); + GMX_ASSERT(src.empty() || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order"); dest->iatoms.resize(src.size()); diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index dacb612308..5e66c52eea 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
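cos_angle above clamps its result to [-1, 1]: round-off in the rsqrt-based cosine can push the value marginally outside that range, and a downstream acosf would then return NaN. A host-side sketch of the same guard:

```cpp
#include <algorithm>
#include <cmath>

float cosAngleHost(const float a[3], const float b[3])
{
    const float ipa  = a[0] * a[0] + a[1] * a[1] + a[2] * a[2];
    const float ipb  = b[0] * b[0] + b[1] * b[1] + b[2] * b[2];
    const float ip   = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
    const float ipab = ipa * ipb;
    // Degenerate (zero-length) input falls back to cos = 1, as in the kernel.
    const float cosval = (ipab > 0.0F) ? ip / std::sqrt(ipab) : 1.0F;
    return std::clamp(cosval, -1.0F, 1.0F); // guard acosf against round-off
}
```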
* - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -127,7 +127,7 @@ class GpuBonded::Impl public: //! Constructor Impl(const gmx_ffparams_t& ffparams, - const float electrostaticsScaleFactor, + float electrostaticsScaleFactor, const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index 8ab52bf5b4..407e447bdc 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -73,7 +73,7 @@ /*-------------------------------- CUDA kernels-------------------------------- */ /*------------------------------------------------------------------------------*/ -#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f) +#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0F) /*---------------- BONDED CUDA kernels--------------*/ @@ -81,7 +81,7 @@ __device__ __forceinline__ static void harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F) { - constexpr float half = 0.5f; + constexpr float half = 0.5F; float dx, dx2; dx = x - xA; @@ -104,10 +104,10 @@ __device__ void bonds_gpu(const int i, { if (i < numBonds) { - int3 bondData = *(int3*)(d_forceatoms + 3 * i); - int type = bondData.x; - int ai = bondData.y; - int aj = bondData.z; + const int3 bondData = *(reinterpret_cast(d_forceatoms + 3 * i)); + int type = bondData.x; + int ai = bondData.y; + int aj = bondData.z; /* dx = xi - xj, corrected for periodic boundary conditions. 
*/ float3 dx; @@ -125,7 +125,7 @@ __device__ void bonds_gpu(const int i, *vtot_loc += vbond; } - if (dr2 != 0.0f) + if (dr2 != 0.0F) { fbond *= rsqrtf(dr2); @@ -175,11 +175,11 @@ __device__ void angles_gpu(const int i, { if (i < numBonds) { - int4 angleData = *(int4*)(d_forceatoms + 4 * i); - int type = angleData.x; - int ai = angleData.y; - int aj = angleData.z; - int ak = angleData.w; + const int4 angleData = *(reinterpret_cast(d_forceatoms + 4 * i)); + int type = angleData.x; + int ai = angleData.y; + int aj = angleData.z; + int ak = angleData.w; float3 r_ij; float3 r_kj; @@ -203,9 +203,9 @@ __device__ void angles_gpu(const int i, } float cos_theta2 = cos_theta * cos_theta; - if (cos_theta2 < 1.0f) + if (cos_theta2 < 1.0F) { - float st = dVdt * rsqrtf(1.0f - cos_theta2); + float st = dVdt * rsqrtf(1.0F - cos_theta2); float sth = st * cos_theta; float nrij2 = norm2(r_ij); float nrkj2 = norm2(r_kj); @@ -248,11 +248,11 @@ __device__ void urey_bradley_gpu(const int i, { if (i < numBonds) { - int4 ubData = *(int4*)(d_forceatoms + 4 * i); - int type = ubData.x; - int ai = ubData.y; - int aj = ubData.z; - int ak = ubData.w; + const int4 ubData = *(reinterpret_cast(d_forceatoms + 4 * i)); + int type = ubData.x; + int ai = ubData.y; + int aj = ubData.z; + int ak = ubData.w; float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F; float kthA = d_forceparams[type].u_b.kthetaA; @@ -287,9 +287,9 @@ __device__ void urey_bradley_gpu(const int i, harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond); float cos_theta2 = cos_theta * cos_theta; - if (cos_theta2 < 1.0f) + if (cos_theta2 < 1.0F) { - float st = dVdt * rsqrtf(1.0f - cos_theta2); + float st = dVdt * rsqrtf(1.0F - cos_theta2); float sth = st * cos_theta; float nrkj2 = norm2(r_kj); @@ -316,7 +316,7 @@ __device__ void urey_bradley_gpu(const int i, } /* Time for the bond calculations */ - if (dr2 != 0.0f) + if (dr2 != 0.0F) { if (calcEner) { @@ -361,7 +361,7 @@ __device__ __forceinline__ static float dih_angle_gpu(const T xi, *n = cprod(*r_kj, *r_kl); float phi = gmx_angle(*m, *n); float ipr = iprod(*r_ij, *n); - float sign = (ipr < 0.0f) ? -1.0f : 1.0f; + float sign = (ipr < 0.0F) ? 
-1.0F : 1.0F; phi = sign * phi; return phi; @@ -375,7 +375,7 @@ dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, mdphi = mult * phi - phiA * CUDA_DEG2RAD_F; sdphi = sinf(mdphi); - *v = cpA * (1.0f + cosf(mdphi)); + *v = cpA * (1.0F + cosf(mdphi)); *f = -cpA * mult * sdphi; } @@ -499,7 +499,7 @@ __device__ void rbdihs_gpu(const int i, float3 sm_fShiftLoc[], const PbcAiuc pbcAiuc) { - constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f; + constexpr float c0 = 0.0F, c1 = 1.0F, c2 = 2.0F, c3 = 3.0F, c4 = 4.0F, c5 = 5.0F; if (i < numBonds) { @@ -597,11 +597,11 @@ __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp) /* dp cannot be outside (-pi,pi) */ if (*dp >= CUDART_PI_F) { - *dp -= 2.0f * CUDART_PI_F; + *dp -= 2.0F * CUDART_PI_F; } else if (*dp < -CUDART_PI_F) { - *dp += 2.0f * CUDART_PI_F; + *dp += 2.0F * CUDART_PI_F; } } @@ -658,7 +658,7 @@ __device__ void idihs_gpu(const int i, if (calcEner) { - *vtot_loc += -0.5f * ddphi * dp; + *vtot_loc += -0.5F * ddphi * dp; } } } @@ -679,10 +679,10 @@ __device__ void pairs_gpu(const int i, if (i < numBonds) { // TODO this should be made into a separate type, the GPU and CPU sizes should be compared - int3 pairData = *(int3*)(d_forceatoms + 3 * i); - int type = pairData.x; - int ai = pairData.y; - int aj = pairData.z; + const int3 pairData = *(reinterpret_cast(d_forceatoms + 3 * i)); + int type = pairData.x; + int ai = pairData.y; + int aj = pairData.z; float qq = gm_xq[ai].w * gm_xq[aj].w; float c6 = iparams[type].lj14.c6A; @@ -701,7 +701,7 @@ __device__ void pairs_gpu(const int i, float velec = scale_factor * qq * rinv; /* Calculate the LJ force * r and add it to the Coulomb part */ - float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec; + float fr = (12.0F * c12 * rinv6 - 6.0F * c6) * rinv6 + velec; float finvr = fr * rinv2; float3 f = finvr * dr; @@ -737,14 +737,14 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams) extern __shared__ char sm_dynamicShmem[]; char* sm_nextSlotPtr = sm_dynamicShmem; - float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr; + float3* sm_fShiftLoc = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += c_numShiftVectors * sizeof(float3); if (calcVir) { if (threadIdx.x < c_numShiftVectors) { - sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f); + sm_fShiftLoc[threadIdx.x] = make_float3(0.0F, 0.0F, 0.0F); } __syncthreads(); } @@ -865,11 +865,11 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams) int warpId = threadIdx.x / warpSize; // Shared memory variables to hold block-local partial sum - float* sm_vTot = (float*)sm_nextSlotPtr; + float* sm_vTot = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += numWarps * sizeof(float); - float* sm_vTotVdw = (float*)sm_nextSlotPtr; + float* sm_vTotVdw = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += numWarps * sizeof(float); - float* sm_vTotElec = (float*)sm_nextSlotPtr; + float* sm_vTotElec = reinterpret_cast(sm_nextSlotPtr); if (threadIdx.x % warpSize == 0) { diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index 93772853d5..73972c3f66 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -81,7 +81,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForcePtr, baseForce_ = baseForcePtr; numAtoms_ = numAtoms; atomStart_ = atomStart; - accumulate_ = static_cast(accumulate); + accumulate_ = accumulate; completionMarker_ = 
completionMarker; cellInfo_.cell = cell.data(); @@ -112,7 +112,7 @@ void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) rvecForceToAdd_ = forcePtr; }; -void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency) +void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency) { dependencyList_.push_back(dependency); } @@ -157,8 +157,6 @@ void GpuForceReduction::Impl::execute() wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } -GpuForceReduction::Impl::~Impl() = default; - GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : @@ -176,7 +174,7 @@ void GpuForceReduction::registerRvecForce(DeviceBuffer forcePtr) impl_->registerRvecForce(forcePtr); } -void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) +void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency) { impl_->addDependency(dependency); } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.h index 316f4cca31..f4b5d05e8a 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.h +++ b/src/gromacs/mdlib/gpuforcereduction_impl.h @@ -78,8 +78,8 @@ public: * \param [in] wcycle The wallclock counter */ Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); - ~Impl(); + ~Impl() = default; /*! \brief Register a nbnxm-format force to be reduced * * \param [in] forcePtr Pointer to force to be reduced @@ -96,7 +96,7 @@ public: * * \param [in] dependency Dependency for this reduction */ - void addDependency(GpuEventSynchronizer* const dependency); + void addDependency(GpuEventSynchronizer* dependency); /*! \brief Reinitialize the GPU force reduction * @@ -108,10 +108,10 @@ public: * \param [in] completionMarker Event to be marked when launch of reduction is complete */ void reinit(DeviceBuffer baseForcePtr, - const int numAtoms, + int numAtoms, ArrayRef cell, - const int atomStart, - const bool accumulate, + int atomStart, + bool accumulate, GpuEventSynchronizer* completionMarker = nullptr); /*! \brief Execute the force reduction */ @@ -125,7 +125,7 @@ private: //! number of atoms int numAtoms_ = 0; //! whether reduction is accumulated into base force buffer - int accumulate_ = true; + bool accumulate_ = true; //! cell information for any nbat-format forces struct cellInfo cellInfo_; //! GPU context object diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu b/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu index 4fb187f484..acdaf36c32 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu @@ -91,7 +91,6 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce, *gm_fDest = temp; } - return; } void launchForceReductionKernel(int numAtoms, diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h index 0f48770178..757aa4a7b0 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.h +++ b/src/gromacs/mdlib/leapfrog_gpu.h @@ -124,12 +124,12 @@ public: void integrate(DeviceBuffer d_x, DeviceBuffer d_xp, DeviceBuffer d_v, - const DeviceBuffer d_f, - const float dt, - const bool doTemperatureScaling, + DeviceBuffer d_f, + float dt, + bool doTemperatureScaling, gmx::ArrayRef tcstat, - const bool doParrinelloRahman, - const float dtPressureCouple, + bool doParrinelloRahman, + float dtPressureCouple, const matrix prVelocityScalingMatrix); /*! 
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 0f48770178..757aa4a7b0 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -124,12 +124,12 @@ public:
    void integrate(DeviceBuffer<Float3>              d_x,
                   DeviceBuffer<Float3>              d_xp,
                   DeviceBuffer<Float3>              d_v,
-                   const DeviceBuffer<Float3>        d_f,
-                   const float                       dt,
-                   const bool                        doTemperatureScaling,
+                   DeviceBuffer<Float3>              d_f,
+                   float                             dt,
+                   bool                              doTemperatureScaling,
                   gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                   const bool                        doParrinelloRahman,
-                   const float                       dtPressureCouple,
+                   bool                              doParrinelloRahman,
+                   float                             dtPressureCouple,
                   const matrix                      prVelocityScalingMatrix);

    /*! \brief Set the integrator
@@ -142,7 +142,7 @@ public:
     * \param[in] inverseMasses   Inverse masses of atoms.
     * \param[in] tempScaleGroups Maps the atom index to temperature scale value.
     */
-    void set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
+    void set(int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);

    /*! \brief Class with hardware-specific interfaces and implementations.*/
    class Impl;
diff --git a/src/gromacs/mdlib/lincs_gpu.cpp b/src/gromacs/mdlib/lincs_gpu.cpp
index 761f38a120..86211b17cd 100644
--- a/src/gromacs/mdlib/lincs_gpu.cpp
+++ b/src/gromacs/mdlib/lincs_gpu.cpp
@@ -67,14 +67,14 @@ namespace gmx
 {

-void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
-                     DeviceBuffer<Float3>       d_xp,
-                     const bool                 updateVelocities,
-                     DeviceBuffer<Float3>       d_v,
-                     const real                 invdt,
-                     const bool                 computeVirial,
-                     tensor                     virialScaled,
-                     const PbcAiuc              pbcAiuc)
+void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
+                     DeviceBuffer<Float3>        d_xp,
+                     const bool                  updateVelocities,
+                     DeviceBuffer<Float3>        d_v,
+                     const real                  invdt,
+                     const bool                  computeVirial,
+                     tensor                      virialScaled,
+                     const PbcAiuc&              pbcAiuc)
 {
     GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");

@@ -120,8 +120,6 @@ void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
         virialScaled[ZZ][YY] += h_virialScaled_[4];
         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
     }
-
-    return;
 }

 LincsGpu::LincsGpu(int numIterations,
@@ -482,8 +480,8 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int center = c1a1;

-                    float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
-                    float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+                    float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+                    float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);

                     massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;

@@ -509,8 +507,8 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int center = c1a2;

-                    float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
-                    float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+                    float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+                    float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);

                     massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;

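lincs_gpu.cpp now calls std::sqrt instead of the unqualified C library sqrt. The qualified call resolves to the float overload for float arguments rather than forcing everything through the double-only ::sqrt (the 1.0 literal in the expression above still promotes the division to double; the change is about the overload set). Illustrative only:

    #include <cmath>

    static float inverseSqrtSum(float invMass1, float invMass2)
    {
        // std::sqrt has float/double/long double overloads; ::sqrt is double-only.
        return 1.0F / std::sqrt(invMass1 + invMass2);
    }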
diff --git a/src/gromacs/mdlib/lincs_gpu.h b/src/gromacs/mdlib/lincs_gpu.h
index 3ed6cd3e59..690d21df82 100644
--- a/src/gromacs/mdlib/lincs_gpu.h
+++ b/src/gromacs/mdlib/lincs_gpu.h
@@ -144,14 +144,14 @@ public:
     * \param[in,out] virialScaled     Scaled virial tensor to be updated.
     * \param[in]     pbcAiuc          PBC data.
     */
-    void apply(const DeviceBuffer<Float3> d_x,
-               DeviceBuffer<Float3>       d_xp,
-               const bool                 updateVelocities,
-               DeviceBuffer<Float3>       d_v,
-               const real                 invdt,
-               const bool                 computeVirial,
-               tensor                     virialScaled,
-               const PbcAiuc              pbcAiuc);
+    void apply(const DeviceBuffer<Float3>& d_x,
+               DeviceBuffer<Float3>        d_xp,
+               bool                        updateVelocities,
+               DeviceBuffer<Float3>        d_v,
+               real                        invdt,
+               bool                        computeVirial,
+               tensor                      virialScaled,
+               const PbcAiuc&              pbcAiuc);

    /*! \brief
     * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu
index 55d4b48bad..15e3a288df 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -107,7 +107,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
     float* __restrict__ gm_virialScaled        = kernelParams.d_virialScaled;

-    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
@@ -122,7 +122,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     int j = pair.j;

     // Mass-scaled Lagrange multiplier
-    float lagrangeScaled = 0.0f;
+    float lagrangeScaled = 0.0F;

     float targetLength;
     float inverseMassi;
@@ -139,14 +139,14 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     // Everything computed for these dummies will be equal to zero
     if (isDummyThread)
     {
-        targetLength    = 0.0f;
-        inverseMassi    = 0.0f;
-        inverseMassj    = 0.0f;
-        sqrtReducedMass = 0.0f;
-
-        xi = make_float3(0.0f, 0.0f, 0.0f);
-        xj = make_float3(0.0f, 0.0f, 0.0f);
-        rc = make_float3(0.0f, 0.0f, 0.0f);
+        targetLength    = 0.0F;
+        inverseMassi    = 0.0F;
+        inverseMassj    = 0.0F;
+        sqrtReducedMass = 0.0F;
+
+        xi = make_float3(0.0F, 0.0F, 0.0F);
+        xj = make_float3(0.0F, 0.0F, 0.0F);
+        rc = make_float3(0.0F, 0.0F, 0.0F);
     }
     else
     {
@@ -209,7 +209,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     {
         // Making sure that all sm_rhs are saved before they are accessed in a loop below
         __syncthreads();
-        float mvb = 0.0f;
+        float mvb = 0.0F;

         for (int n = 0; n < coupledConstraintsCount; n++)
         {
@@ -256,12 +256,12 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);

         float len2  = targetLength * targetLength;
-        float dlen2 = 2.0f * len2 - norm2(dx);
+        float dlen2 = 2.0F * len2 - norm2(dx);

         // TODO A little bit more effective but slightly less readable version of the below would be:
         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
         float proj;
-        if (dlen2 > 0.0f)
+        if (dlen2 > 0.0F)
         {
             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
         }
@@ -373,8 +373,6 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
         }
     }
-
-    return;
 }

 /*! \brief Select templated kernel.
@@ -409,14 +407,14 @@ inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVir
     return kernelPtr;
 }

-void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
-                          const DeviceBuffer<Float3> d_x,
-                          DeviceBuffer<Float3>       d_xp,
-                          const bool                 updateVelocities,
-                          DeviceBuffer<Float3>       d_v,
-                          const real                 invdt,
-                          const bool                 computeVirial,
-                          const DeviceStream&        deviceStream)
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+                          const DeviceBuffer<Float3>&     d_x,
+                          DeviceBuffer<Float3>            d_xp,
+                          const bool                      updateVelocities,
+                          DeviceBuffer<Float3>            d_v,
+                          const real                      invdt,
+                          const bool                      computeVirial,
+                          const DeviceStream&             deviceStream)
 {
     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);

@@ -459,8 +457,6 @@ void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
                     nullptr,
                     "lincs_kernel",
                     kernelArgs);
-
-    return;
 }

 } // namespace gmx
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h
index 5c42a784c6..cd421c198c 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.h
+++ b/src/gromacs/mdlib/lincs_gpu_internal.h
@@ -57,14 +57,14 @@ struct LincsGpuKernelParameters;
 //! Number of threads in a GPU block
 constexpr static int c_threadsPerBlock = 256;

-void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
-                          const DeviceBuffer<Float3> d_x,
-                          DeviceBuffer<Float3>       d_xp,
-                          const bool                 updateVelocities,
-                          DeviceBuffer<Float3>       d_v,
-                          const real                 invdt,
-                          const bool                 computeVirial,
-                          const DeviceStream&        deviceStream);
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+                          const DeviceBuffer<Float3>&     d_x,
+                          DeviceBuffer<Float3>            d_xp,
+                          bool                            updateVelocities,
+                          DeviceBuffer<Float3>            d_v,
+                          real                            invdt,
+                          bool                            computeVirial,
+                          const DeviceStream&             deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
index b5594c1bc0..1e87f6968f 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
@@ -49,8 +49,8 @@ namespace gmx
 {

-void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */,
-                          const DeviceBuffer<Float3> /* d_x */,
+void launchLincsGpuKernel(const LincsGpuKernelParameters& /* kernelParams */,
+                          const DeviceBuffer<Float3>& /* d_x */,
                           DeviceBuffer<Float3> /* d_xp */,
                           const bool /* updateVelocities */,
                           DeviceBuffer<Float3> /* d_v */,
diff --git a/src/gromacs/mdlib/settle_gpu.cpp b/src/gromacs/mdlib/settle_gpu.cpp
index 659947ac35..e836ff1779 100644
--- a/src/gromacs/mdlib/settle_gpu.cpp
+++ b/src/gromacs/mdlib/settle_gpu.cpp
@@ -63,14 +63,14 @@ namespace gmx
 {

-void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
-                      DeviceBuffer<Float3>       d_xp,
-                      const bool                 updateVelocities,
-                      DeviceBuffer<Float3>       d_v,
-                      const real                 invdt,
-                      const bool                 computeVirial,
-                      tensor                     virialScaled,
-                      const PbcAiuc              pbcAiuc)
+void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
+                      DeviceBuffer<Float3>        d_xp,
+                      const bool                  updateVelocities,
+                      DeviceBuffer<Float3>        d_v,
+                      const real                  invdt,
+                      const bool                  computeVirial,
+                      tensor                      virialScaled,
+                      const PbcAiuc&              pbcAiuc)
 {

     // Early exit if no settles
@@ -118,8 +118,6 @@ void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
         virialScaled[ZZ][YY] += h_virialScaled_[4];
         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
     }
-
-    return;
 }

 SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
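Both the LINCS and SETTLE launchers select a fully templated kernel from two runtime flags through a helper (getLincsKernelPtr above, getSettleKernelPtr below), so the hot loops compile without branches on updateVelocities or computeVirial. The dispatch idiom, sketched with a hypothetical kernel:

    template<bool updateVelocities, bool computeVirial>
    __global__ void constraintsKernel(/* kernel parameters */) {}

    static auto getConstraintsKernelPtr(const bool updateVelocities, const bool computeVirial)
    {
        auto kernelPtr = constraintsKernel<true, true>;
        if (updateVelocities && !computeVirial)
        {
            kernelPtr = constraintsKernel<true, false>;
        }
        else if (!updateVelocities && computeVirial)
        {
            kernelPtr = constraintsKernel<false, true>;
        }
        else if (!updateVelocities && !computeVirial)
        {
            kernelPtr = constraintsKernel<false, false>;
        }
        return kernelPtr; // all four instantiations share one function-pointer type
    }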
diff --git a/src/gromacs/mdlib/settle_gpu.h b/src/gromacs/mdlib/settle_gpu.h
index 0f85649b95..998f545f86 100644
--- a/src/gromacs/mdlib/settle_gpu.h
+++ b/src/gromacs/mdlib/settle_gpu.h
@@ -114,14 +114,14 @@ public:
     * \param[in,out] virialScaled     Scaled virial tensor to be updated.
     * \param[in]     pbcAiuc          PBC data.
     */
-    void apply(const DeviceBuffer<Float3> d_x,
-               DeviceBuffer<Float3>       d_xp,
-               const bool                 updateVelocities,
-               DeviceBuffer<Float3>       d_v,
-               const real                 invdt,
-               const bool                 computeVirial,
-               tensor                     virialScaled,
-               const PbcAiuc              pbcAiuc);
+    void apply(const DeviceBuffer<Float3>& d_x,
+               DeviceBuffer<Float3>        d_xp,
+               bool                        updateVelocities,
+               DeviceBuffer<Float3>        d_v,
+               real                        invdt,
+               bool                        computeVirial,
+               tensor                      virialScaled,
+               const PbcAiuc&              pbcAiuc);

    /*! \brief
     * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/settle_gpu_internal.cu b/src/gromacs/mdlib/settle_gpu_internal.cu
index 11dd63b035..366df8d3e4 100644
--- a/src/gromacs/mdlib/settle_gpu_internal.cu
+++ b/src/gromacs/mdlib/settle_gpu_internal.cu
@@ -205,7 +205,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__

             float sinphi = a1d_z * rsqrt(pars.ra * pars.ra);
-            float tmp2   = 1.0f - sinphi * sinphi;
+            float tmp2   = 1.0F - sinphi * sinphi;

             if (almost_zero > tmp2)
             {
@@ -215,7 +215,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             float tmp    = rsqrt(tmp2);
             float cosphi = tmp2 * tmp;
             float sinpsi = (b1d.z - c1d.z) * pars.irc2 * tmp;
-            tmp2         = 1.0f - sinpsi * sinpsi;
+            tmp2         = 1.0F - sinpsi * sinpsi;

             float cospsi = tmp2 * rsqrt(tmp2);

@@ -235,7 +235,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             float sinthe = (alpha * gamma - beta * tmp2 * rsqrt(tmp2)) * rsqrt(al2be2 * al2be2);

             /*  --- Step4  A3' --- */
-            tmp2         = 1.0f - sinthe * sinthe;
+            tmp2         = 1.0F - sinthe * sinthe;
             float costhe = tmp2 * rsqrt(tmp2);

             float3 a3d, b3d, c3d;
@@ -318,7 +318,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             {
                 for (int d = 0; d < 6; d++)
                 {
-                    sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0f;
+                    sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0F;
                 }
             }
         }
@@ -358,8 +358,6 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             atomicAdd(&(gm_virialScaled[tib]), sm_threadVirial[tib * blockSize]);
         }
     }
-
-    return;
 }

 /*! \brief Select templated kernel.
@@ -394,18 +392,18 @@ inline auto getSettleKernelPtr(const bool updateVelocities, const bool computeVi
     return kernelPtr;
 }

-void launchSettleGpuKernel(const int                         numSettles,
-                           const DeviceBuffer<WaterMolecule> d_atomIds,
-                           const SettleParameters            settleParameters,
-                           const DeviceBuffer<Float3>        d_x,
-                           DeviceBuffer<Float3>              d_xp,
-                           const bool                        updateVelocities,
-                           DeviceBuffer<Float3>              d_v,
-                           const real                        invdt,
-                           const bool                        computeVirial,
-                           DeviceBuffer<float>               virialScaled,
-                           const PbcAiuc                     pbcAiuc,
-                           const DeviceStream&               deviceStream)
+void launchSettleGpuKernel(const int                          numSettles,
+                           const DeviceBuffer<WaterMolecule>& d_atomIds,
+                           const SettleParameters&            settleParameters,
+                           const DeviceBuffer<Float3>&        d_x,
+                           DeviceBuffer<Float3>               d_xp,
+                           const bool                         updateVelocities,
+                           DeviceBuffer<Float3>               d_v,
+                           const real                         invdt,
+                           const bool                         computeVirial,
+                           DeviceBuffer<float>                virialScaled,
+                           const PbcAiuc&                     pbcAiuc,
+                           const DeviceStream&                deviceStream)
 {
     static_assert(
             gmx::isPowerOfTwo(sc_threadsPerBlock),
@@ -449,8 +447,6 @@ void launchSettleGpuKernel(const int numSettles,
                     nullptr,
                     "settle_kernel",
                     kernelArgs);
-
-    return;
 }

 } // namespace gmx
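The SETTLE kernel context lines above repeatedly evaluate expressions of the form tmp2 * rsqrt(tmp2): for positive tmp2 this equals sqrt(tmp2), but reuses the fast reciprocal-square-root hardware path. The identity in isolation (assumes x > 0; illustrative only):

    __device__ float sqrtViaRsqrt(float x)
    {
        // x * (1 / sqrt(x)) == sqrt(x); rsqrt maps to a fast hardware approximation.
        return x * rsqrt(x);
    }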
diff --git a/src/gromacs/mdlib/settle_gpu_internal.h b/src/gromacs/mdlib/settle_gpu_internal.h
index 9963944e74..f88d046150 100644
--- a/src/gromacs/mdlib/settle_gpu_internal.h
+++ b/src/gromacs/mdlib/settle_gpu_internal.h
@@ -73,18 +73,18 @@ namespace gmx
  * \param[in] pbcAiuc          PBC data.
  * \param[in] deviceStream     Device stream to launch kernel in.
  */
-void launchSettleGpuKernel(int                               numSettles,
-                           const DeviceBuffer<WaterMolecule> d_atomIds,
-                           const SettleParameters            settleParameters,
-                           const DeviceBuffer<Float3>        d_x,
-                           DeviceBuffer<Float3>              d_xp,
-                           const bool                        updateVelocities,
-                           DeviceBuffer<Float3>              d_v,
-                           const real                        invdt,
-                           const bool                        computeVirial,
-                           DeviceBuffer<float>               virialScaled,
-                           const PbcAiuc                     pbcAiuc,
-                           const DeviceStream&               deviceStream);
+void launchSettleGpuKernel(int                                numSettles,
+                           const DeviceBuffer<WaterMolecule>& d_atomIds,
+                           const SettleParameters&            settleParameters,
+                           const DeviceBuffer<Float3>&        d_x,
+                           DeviceBuffer<Float3>               d_xp,
+                           bool                               updateVelocities,
+                           DeviceBuffer<Float3>               d_v,
+                           real                               invdt,
+                           bool                               computeVirial,
+                           DeviceBuffer<float>                virialScaled,
+                           const PbcAiuc&                     pbcAiuc,
+                           const DeviceStream&                deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
index ef063c36d5..a6e6e9a40d 100644
--- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
@@ -49,16 +49,16 @@ namespace gmx
 {

 void launchSettleGpuKernel(const int /* numSettles */,
-                           const DeviceBuffer<WaterMolecule> /* d_atomIds */,
-                           const SettleParameters /* settleParameters */,
-                           const DeviceBuffer<Float3> /* d_x */,
+                           const DeviceBuffer<WaterMolecule>& /* d_atomIds */,
+                           const SettleParameters& /* settleParameters */,
+                           const DeviceBuffer<Float3>& /* d_x */,
                            DeviceBuffer<Float3> /* d_xp */,
                            const bool /* updateVelocities */,
                            DeviceBuffer<Float3> /* d_v */,
                            const real /* invdt */,
                            const bool /* computeVirial */,
                            DeviceBuffer<float> /* virialScaled */,
-                           const PbcAiuc /* pbcAiuc */,
+                           const PbcAiuc& /* pbcAiuc */,
                            const DeviceStream& /* deviceStream */)
 {
     // SYCL_TODO
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
index 6e5c56a867..96d447d36c 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -72,7 +72,7 @@
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/mtop_util.h"

-static constexpr bool sc_haveGpuConstraintSupport = (GMX_GPU_CUDA);
+static constexpr bool sc_haveGpuConstraintSupport = GMX_GPU_CUDA;

 namespace gmx
 {
@@ -125,8 +125,6 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead

     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
-    return;
 }

 void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 150c944aae..adfbccda85 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -153,7 +153,7 @@ public:
     */
    void set(DeviceBuffer<RVec>            d_x,
             DeviceBuffer<RVec>            d_v,
-             const DeviceBuffer<RVec>      d_f,
+             DeviceBuffer<RVec>            d_f,
             const InteractionDefinitions& idef,
             const t_mdatoms&              md);

diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.cu b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
index 5787bc476e..9d1dea39af 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
@@ -77,7 +77,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__

 void launchScaleCoordinatesKernel(const int            numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream)
 {
     KernelLaunchConfig kernelLaunchConfig;
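A pattern worth noting in the update_constrain hunks: host-side launchers now take the small ScalingMatrix struct by const reference, while the __global__ kernel keeps receiving it by value, since kernel arguments are copied into the launch anyway. A self-contained sketch with a hypothetical struct and kernel names:

    struct DiagScale
    {
        float xx, yy, zz;
    };

    __global__ void scaleKernel(float3* gm_x, const DiagScale p, int numAtoms) // by value
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            gm_x[i] = make_float3(gm_x[i].x * p.xx, gm_x[i].y * p.yy, gm_x[i].z * p.zz);
        }
    }

    void launchScale(float3* gm_x, const DiagScale& p, int numAtoms, cudaStream_t stream)
    {
        scaleKernel<<<(numAtoms + 127) / 128, 128, 0, stream>>>(gm_x, p); // const ref on the host
    }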
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.h b/src/gromacs/mdlib/update_constrain_gpu_internal.h
index 893110f15b..7e0a2d1a04 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.h
@@ -76,7 +76,7 @@ struct ScalingMatrix
  */
 void launchScaleCoordinatesKernel(int                  numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
index cf9439bf17..5f2233a7f1 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
@@ -69,7 +69,7 @@ static auto scaleKernel(cl::sycl::handler&

 void launchScaleCoordinatesKernel(const int            numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream)
 {
     const cl::sycl::range<1> rangeAllAtoms(numAtoms);
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index 3f66196509..cd119960f3 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -158,7 +158,7 @@ public:
     *
     *  \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
     */
-    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;

    /*! \brief Get the positions buffer on the GPU.

@@ -323,13 +323,13 @@ public:
     *
     *  \returns The number of local atoms.
     */
-    int numAtomsLocal();
+    int numAtomsLocal() const;

    /*! \brief Getter for the total number of atoms.
     *
     *  \returns The total number of atoms.
     */
-    int numAtomsAll();
+    int numAtomsAll() const;

private:
    class Impl;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index f112d1b1cb..0ce99a4dd4 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -84,7 +84,7 @@ void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */, int /* numAtomsAll
               "GPU implementation.");
 }

-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -254,7 +254,7 @@ const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
     return nullptr;
 }

-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -262,7 +262,7 @@ int StatePropagatorDataGpu::numAtomsLocal() const
     return 0;
 }

-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index cc9f92bcfe..0a041db04f 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -153,7 +153,7 @@ public:
     *
     *  \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
     */
-    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;

    /*! \brief Get the positions buffer on the GPU.

@@ -318,13 +318,13 @@ public:
     *
     *  \returns The number of local atoms.
     */
-    int numAtomsLocal();
+    int numAtomsLocal() const;

    /*! \brief Getter for the total number of atoms.
     *
     *  \returns The total number of atoms.
     */
-    int numAtomsAll();
+    int numAtomsAll() const;

private:
    //! GPU PME stream.
@@ -443,7 +443,7 @@ private:
    void clearOnDevice(DeviceBuffer<RVec>  d_data,
                       int                 dataSize,
                       AtomLocality        atomLocality,
-                       const DeviceStream& deviceStream);
+                       const DeviceStream& deviceStream) const;
};

} // namespace gmx
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 69e11d69c0..d4868d20dc 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -178,7 +178,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }

-std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
 {
     int atomsStartAt   = 0;
     int numAtomsToCopy = 0;
@@ -281,7 +281,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<RVec> h_dat
 void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
                                                  int                 dataSize,
                                                  AtomLocality        atomLocality,
-                                                 const DeviceStream& deviceStream)
+                                                 const DeviceStream& deviceStream) const
 {
     GMX_UNUSED_VALUE(dataSize);

@@ -540,12 +540,12 @@ const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
     return updateStream_;
 }

-int StatePropagatorDataGpu::Impl::numAtomsLocal()
+int StatePropagatorDataGpu::Impl::numAtomsLocal() const
 {
     return numAtomsLocal_;
 }

-int StatePropagatorDataGpu::Impl::numAtomsAll()
+int StatePropagatorDataGpu::Impl::numAtomsAll() const
 {
     return numAtomsAll_;
 }
@@ -580,7 +580,7 @@ void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
     return impl_->reinit(numAtomsLocal, numAtomsAll);
 }

-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
 {
     return impl_->getAtomRangesFromAtomLocality(atomLocality);
 }
@@ -695,12 +695,12 @@ const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
     return impl_->getUpdateStream();
 }

-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
 {
     return impl_->numAtomsLocal();
 }

-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
 {
     return impl_->numAtomsAll();
 }
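The new per-directory .clang-tidy that follows disables one check wholesale for the CUDA nbnxm kernels; elsewhere the patch prefers targeted NOLINTNEXTLINE comments, which silence exactly one check on exactly the next line. The mechanism on a hypothetical declaration (not from the patch):

    // Non-const reference parameters are deliberate here (CUDA-style output
    // parameters), so suppress just this one instance of the check.
    // NOLINTNEXTLINE(google-runtime-references)
    static void fetchC6C12(float& c6, float& c12)
    {
        c6  = 0.0F;
        c12 = 0.0F;
    }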
diff --git a/src/gromacs/nbnxm/cuda/.clang-tidy b/src/gromacs/nbnxm/cuda/.clang-tidy
new file mode 100644
index 0000000000..47d31a387a
--- /dev/null
+++ b/src/gromacs/nbnxm/cuda/.clang-tidy
@@ -0,0 +1,8 @@
+# List of rationales for check suppressions (where known).
+# This has to precede the list because inline comments are not
+# supported by clang-tidy.
+#
+# -readability-implicit-bool-conversion
+#   This happens often in the GPU code and should generally be harmless
+Checks: -readability-implicit-bool-conversion
+InheritParentConfig: true
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 1db33caf3e..20ea65b673 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -89,7 +89,7 @@ namespace Nbnxm
 *  there is a bit of fluctuations in the generated block counts, we use
 *  a target of 44 instead of the ideal value of 48.
 */
-static unsigned int gpu_min_ci_balanced_factor = 44;
+static const unsigned int gpu_min_ci_balanced_factor = 44;

 void gpu_init_platform_specific(NbnxmGpu* /* nb */)
 {
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index 7df9231d51..5aaf3b9ead 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -160,7 +160,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
#    endif /* CALC_ENERGIES */
#endif     /* PRUNE_NBL */
-        (const NBAtomDataGpu atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+        (NBAtomDataGpu atdat, NBParamGpu nbparam, Nbnxm::gpu_plist plist, bool bCalcFshift)
#ifdef FUNCTION_DECLARATION_ONLY
        ; /* Only do function declaration, omit the function body. */
#else
@@ -257,28 +257,29 @@ __launch_bounds__(THREADS_PER_BLOCK)
     * sm_nextSlotPtr should always be updated to point to the "next slot",
     * that is past the last point where data has been stored.
     */
+    // NOLINTNEXTLINE(readability-redundant-declaration)
    extern __shared__ char sm_dynamicShmem[];
    char*                  sm_nextSlotPtr = sm_dynamicShmem;
    static_assert(sizeof(char) == 1,
                  "The shared memory offset calculation assumes that char is 1 byte");

    /* shmem buffer for i x+q pre-loading */
-    float4* xqib = (float4*)sm_nextSlotPtr;
+    float4* xqib = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));

    /* shmem buffer for cj, for each warp separately */
-    int* cjs = (int*)(sm_nextSlotPtr);
+    int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
    /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
    cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
    sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));

#    ifndef LJ_COMB
    /* shmem buffer for i atom-type pre-loading */
-    int* atib = (int*)sm_nextSlotPtr;
+    int* atib = reinterpret_cast<int*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
#    else
    /* shmem buffer for i-atom LJ combination rule parameters */
-    float2* ljcpib = (float2*)sm_nextSlotPtr;
+    float2* ljcpib = reinterpret_cast<float2*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
#    endif
    /*********************************************************************/
@@ -294,8 +295,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
            ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
            ai = ci * c_clSize + tidxi;

-            float* shiftptr = (float*)&shift_vec[nb_sci.shift];
-            xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
+            const float* shiftptr = reinterpret_cast<const float*>(&shift_vec[nb_sci.shift]);
+            xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0F);
            xqbuf.w *= nbparam.epsfac;
            xqib[tidxj * c_clSize + tidxi] = xqbuf;

@@ -311,7 +312,7 @@ __launch_bounds__(THREADS_PER_BLOCK)

        for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
        {
-            fci_buf[i] = make_float3(0.0f);
+            fci_buf[i] = make_float3(0.0F);
        }

#    ifdef LJ_EWALD
@@ -322,8 +323,8 @@ __launch_bounds__(THREADS_PER_BLOCK)

#    ifdef CALC_ENERGIES
-    E_lj = 0.0f;
-    E_el = 0.0f;
+    E_lj = 0.0F;
+    E_el = 0.0F;

#        ifdef EXCLUSION_FORCES /* Ewald or RF */
    if (nb_sci.shift == gmx::c_centralShiftIndex
@@ -339,22 +340,23 @@ __launch_bounds__(THREADS_PER_BLOCK)
#        ifdef LJ_EWALD
                // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
-                E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                                  * (ntypes + 1)]);
+                E_lj += LDG(reinterpret_cast<float*>(
+                        &nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                      * (ntypes + 1)]));
#        endif
            }

            /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
#        ifdef LJ_EWALD
            E_lj /= c_clSize * NTHREAD_Z;
-            E_lj *= 0.5f * c_oneSixth * lje_coeff6_6;
+            E_lj *= 0.5F * c_oneSixth * lje_coeff6_6;
#        endif

#        if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
            /* Correct for epsfac^2 due to adding qi^2 */
            E_el /= nbparam.epsfac * c_clSize * NTHREAD_Z;
#        if defined EL_RF || defined EL_CUTOFF
-            E_el *= -0.5f * reactionFieldShift;
+            E_el *= -0.5F * reactionFieldShift;
#        else
            E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
#        endif
@@ -412,7 +414,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
            ljcp_j = lj_comb[aj];
#    endif

-            fcj_buf = make_float3(0.0f);
+            fcj_buf = make_float3(0.0F);

#    if !defined PRUNE_NBL
#        pragma unroll 8
@@ -441,7 +443,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                }
#    endif

-                int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+                int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;

                /* cutoff & exclusion check */
#    ifdef EXCLUSION_FORCES
@@ -499,7 +501,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                    sig_r6 *= int_bit;
#    endif /* EXCLUSION_FORCES */

-                    F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+                    F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
#    endif /* !LJ_COMB_LB || CALC_ENERGIES */

#    ifdef LJ_FORCE_SWITCH
@@ -553,7 +555,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                    /* Separate VDW cut-off check to enable twin-range cut-offs
                     * (rvdw < rcoulomb <= rlist)
                     */
-                    vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                    vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
                    F_invr *= vdw_in_range;
#    ifdef CALC_ENERGIES
                    E_lj_p *= vdw_in_range;
@@ -591,10 +593,10 @@ __launch_bounds__(THREADS_PER_BLOCK)
#        endif
#        ifdef EL_RF
                    E_el += qi * qj_f
-                            * (int_bit * inv_r + 0.5f * two_k_rf * r2 - reactionFieldShift);
+                            * (int_bit * inv_r + 0.5F * two_k_rf * r2 - reactionFieldShift);
#        endif
#        ifdef EL_EWALD_ANY
-                    /* 1.0f - erff is faster than erfcf */
+                    /* 1.0F - erff is faster than erfcf */
                    E_el += qi * qj_f
                            * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
#        endif /* EL_EWALD_ANY */
@@ -633,7 +635,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
        bCalcFshift = false;
    }

-    float fshift_buf = 0.0f;
+    float fshift_buf = 0.0F;

    /* reduce i forces */
    for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index bcc3dd1b09..8219ad16a3 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -104,11 +104,11 @@
 */
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
-        void nbnxn_kernel_prune_cuda(const NBAtomDataGpu    atdat,
-                                     const NBParamGpu       nbparam,
-                                     const Nbnxm::gpu_plist plist,
-                                     int                    numParts,
-                                     int                    part)
+        void nbnxn_kernel_prune_cuda(NBAtomDataGpu    atdat,
+                                     NBParamGpu       nbparam,
+                                     Nbnxm::gpu_plist plist,
+                                     int              numParts,
+                                     int              part)
#ifdef FUNCTION_DECLARATION_ONLY
        ; /* Only do function declaration, omit the function body. */

@@ -152,11 +152,11 @@ nbnxn_kernel_prune_cuda(const NBAtomDataGpu, const NBParamGpu, const Nbnx
                  "The shared memory offset calculation assumes that char is 1 byte");

    /* shmem buffer for i x+q pre-loading */
-    float4* xib = (float4*)sm_nextSlotPtr;
+    float4* xib = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));

    /* shmem buffer for cj, for each warp separately */
-    int* cjs = (int*)(sm_nextSlotPtr);
+    int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
    /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
    cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
    sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
index 554748330f..deeb17a8fa 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -72,8 +72,8 @@ static const int __device__ c_fbufStride = c_clSizeSq;
static const unsigned __device__ superClInteractionMask =
        ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

-static const float __device__ c_oneSixth    = 0.16666667f;
-static const float __device__ c_oneTwelveth = 0.08333333f;
+static const float __device__ c_oneSixth    = 0.16666667F;
+static const float __device__ c_oneTwelveth = 0.08333333F;

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
@@ -102,7 +102,7 @@ calculate_force_switch_F(const NBParamGpu nbparam, float c6, float c12, float in
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
@@ -132,7 +132,7 @@ static __forceinline__ __device__ void calculate_force_switch_F_E(const NBParamG
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
@@ -141,8 +141,11 @@ static __forceinline__ __device__ void calculate_force_switch_F_E(const NBParamG
}

/*! Apply potential switch, force-only version. */
-static __forceinline__ __device__ void
-calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
+static __forceinline__ __device__ void calculate_potential_switch_F(const NBParamGpu& nbparam,
+                                                                    float             inv_r,
+                                                                    float             r2,
+                                                                    float*            F_invr,
+                                                                    const float*      E_lj)
{
    float r, r_switch;
    float sw, dsw;
@@ -159,9 +162,9 @@ calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, fl
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
-    if (r_switch > 0.0f)
+    if (r_switch > 0.0F)
    {
-        sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+        sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
@@ -185,10 +188,10 @@ calculate_potential_switch_F_E(const NBParamGpu nbparam, float inv_r, float r2,
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    /* Unlike in the F-only kernel, masking is faster here */
-    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+    sw  = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
@@ -234,7 +237,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const NBPa
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
-    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+    poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
@@ -263,14 +266,14 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NB
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
-    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+    poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam.sh_lj_ewald * int_bit;
-    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}

/*! Fetch per-type LJ parameters.
@@ -321,7 +324,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const NBPa
        inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
        cr2       = lje_coeff2 * r2;
        expmcr2   = expf(-cr2);
-        poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+        poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

        /* Subtract the grid force from the total LJ force */
        *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
@@ -332,7 +335,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const NBPa

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald * int_bit;
-        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
    }
}

@@ -376,7 +379,7 @@ __forceinline__ __host__ __device__ T lerp(T d0, T d1, T t)
static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBParamGpu nbparam, float r)
{
    float normalized = nbparam.coulomb_tab_scale * r;
-    int   index      = (int)normalized;
+    int   index      = static_cast<int>(normalized);
    float fraction   = normalized - index;

    float2 d01 = fetch_coulomb_force_r(nbparam, index);
@@ -389,6 +392,7 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBPara
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
+// NOLINTNEXTLINE(google-runtime-references)
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
{
    float2 c6c12;
@@ -405,19 +409,19 @@ static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12,
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
-    const float FN6 = -1.7357322914161492954e-8f;
-    const float FN5 = 1.4703624142580877519e-6f;
-    const float FN4 = -0.000053401640219807709149f;
-    const float FN3 = 0.0010054721316683106153f;
-    const float FN2 = -0.019278317264888380590f;
-    const float FN1 = 0.069670166153766424023f;
-    const float FN0 = -0.75225204789749321333f;
-
-    const float FD4 = 0.0011193462567257629232f;
-    const float FD3 = 0.014866955030185295499f;
-    const float FD2 = 0.11583842382862377919f;
-    const float FD1 = 0.50736591960530292870f;
-    const float FD0 = 1.0f;
+    const float FN6 = -1.7357322914161492954e-8F;
+    const float FN5 = 1.4703624142580877519e-6F;
+    const float FN4 = -0.000053401640219807709149F;
+    const float FN3 = 0.0010054721316683106153F;
+    const float FN2 = -0.019278317264888380590F;
+    const float FN1 = 0.069670166153766424023F;
+    const float FN0 = -0.75225204789749321333F;
+
+    const float FD4 = 0.0011193462567257629232F;
+    const float FD3 = 0.014866955030185295499F;
+    const float FD2 = 0.11583842382862377919F;
+    const float FD1 = 0.50736591960530292870F;
+    const float FD0 = 1.0F;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;
@@ -429,7 +433,7 @@ static __forceinline__ __device__ float pmecorrF(float z2)
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

-    polyFD0 = 1.0f / polyFD0;
+    polyFD0 = 1.0F / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
@@ -445,11 +449,11 @@ static __forceinline__ __device__ float pmecorrF(float z2)
 *  arbitrary array sizes.
 */
static __forceinline__ __device__ void
-reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
+reduce_force_j_generic(const float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
    if (tidxi < 3)
    {
-        float f = 0.0f;
+        float f = 0.0F;
        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        {
            f += f_buf[c_fbufStride * tidxi + j];
        }
@@ -494,17 +498,17 @@ reduce_force_j_warp_shfl(float3 f, float3* fout, int tidxi, int aidx, const unsi
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
-static __forceinline__ __device__ void reduce_force_i_generic(float*  f_buf,
-                                                              float3* fout,
-                                                              float*  fshift_buf,
-                                                              bool    bCalcFshift,
-                                                              int     tidxi,
-                                                              int     tidxj,
-                                                              int     aidx)
+static __forceinline__ __device__ void reduce_force_i_generic(const float* f_buf,
+                                                              float3*      fout,
+                                                              float*       fshift_buf,
+                                                              bool         bCalcFshift,
+                                                              int          tidxi,
+                                                              int          tidxj,
+                                                              int          aidx)
{
    if (tidxj < 3)
    {
-        float f = 0.0f;
+        float f = 0.0F;
        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        {
            f += f_buf[tidxj * c_fbufStride + j];
        }
@@ -533,7 +537,7 @@ static __forceinline__ __device__ void reduce_force_i_pow2(volatile float* f_buf
    int   i, j;
    float f;

-    assert(c_clSize == 1 << c_clSizeLog2);
+    static_assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads.
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
index 577b2b4b92..f6533259b5 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
@@ -89,7 +89,7 @@ static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,

    // Destination address where x should be stored in nbnxm layout. We use this cast here to
    // save only x, y and z components, not touching the w (q) component, which is pre-defined.
-    float3* gm_xqDest = (float3*)&gm_xq[threadIndex + offset];
+    float3* gm_xqDest = reinterpret_cast<float3*>(&gm_xq[threadIndex + offset]);

    // Perform layout conversion of each element.
    if (threadIndex < numAtoms)
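The buffer-ops kernel above writes only x, y and z of each float4 through a float3* view, leaving the pre-set charge in w untouched. That partial-write trick in a standalone form (hypothetical kernel, not from the patch):

    __global__ void storeXyzKeepW(float4* gm_xq, const float3* gm_x, int numAtoms)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            // The float3 view aliases the first 12 bytes of the 16-byte float4
            // element, so the store cannot clobber the w (charge) component.
            float3* gm_xyzDest = reinterpret_cast<float3*>(&gm_xq[i]);
            *gm_xyzDest        = gm_x[i];
        }
    }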
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
index dc91a7a299..09b2d6e379 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
@@ -43,12 +43,13 @@
 #include "gromacs/gpu_utils/gputraits.h"

 class DeviceStream;
-class Grid;
 struct NbnxmGpu;

 namespace Nbnxm
 {

+class Grid;
+
 /*! \brief Launch coordinate layout conversion kernel
 *
 * \param[in] grid    Pair-search grid.
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index 41c107deea..6e7a5b799f 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -107,7 +107,7 @@ static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& t
 {
     if (nbp->coulomb_tab)
     {
-        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
     }

     nbp->coulomb_tab_scale = tables.scale;
@@ -1132,17 +1132,17 @@ void gpu_free(NbnxmGpu* nb)
     /* Free nbparam */
     if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
     {
-        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
     }

     if (!useLjCombRule(nb->nbparam->vdwType))
     {
-        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
     }

     if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
     {
-        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
     }

     /* Free plist */
diff --git a/src/gromacs/pbcutil/pbc_aiuc.h b/src/gromacs/pbcutil/pbc_aiuc.h
index bd45ec5d5c..97717d5267 100644
--- a/src/gromacs/pbcutil/pbc_aiuc.h
+++ b/src/gromacs/pbcutil/pbc_aiuc.h
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -99,32 +99,32 @@ struct PbcAiuc
 static inline void setPbcAiuc(int numPbcDim, const matrix box, PbcAiuc* pbcAiuc)
 {
-    pbcAiuc->invBoxDiagZ = 0.0f;
-    pbcAiuc->boxZX       = 0.0f;
-    pbcAiuc->boxZY       = 0.0f;
-    pbcAiuc->boxZZ       = 0.0f;
-    pbcAiuc->invBoxDiagY = 0.0f;
-    pbcAiuc->boxYX       = 0.0f;
-    pbcAiuc->boxYY       = 0.0f;
-    pbcAiuc->invBoxDiagX = 0.0f;
-    pbcAiuc->boxXX       = 0.0f;
+    pbcAiuc->invBoxDiagZ = 0.0F;
+    pbcAiuc->boxZX       = 0.0F;
+    pbcAiuc->boxZY       = 0.0F;
+    pbcAiuc->boxZZ       = 0.0F;
+    pbcAiuc->invBoxDiagY = 0.0F;
+    pbcAiuc->boxYX       = 0.0F;
+    pbcAiuc->boxYY       = 0.0F;
+    pbcAiuc->invBoxDiagX = 0.0F;
+    pbcAiuc->boxXX       = 0.0F;

     if (numPbcDim > ZZ)
     {
-        pbcAiuc->invBoxDiagZ = 1.0f / box[ZZ][ZZ];
+        pbcAiuc->invBoxDiagZ = 1.0F / box[ZZ][ZZ];
         pbcAiuc->boxZX       = box[ZZ][XX];
         pbcAiuc->boxZY       = box[ZZ][YY];
         pbcAiuc->boxZZ       = box[ZZ][ZZ];
     }
     if (numPbcDim > YY)
     {
-        pbcAiuc->invBoxDiagY = 1.0f / box[YY][YY];
+        pbcAiuc->invBoxDiagY = 1.0F / box[YY][YY];
         pbcAiuc->boxYX       = box[YY][XX];
         pbcAiuc->boxYY       = box[YY][YY];
     }
     if (numPbcDim > XX)
     {
-        pbcAiuc->invBoxDiagX = 1.0f / box[XX][XX];
+        pbcAiuc->invBoxDiagX = 1.0F / box[XX][XX];
         pbcAiuc->boxXX       = box[XX][XX];
     }
 }
diff --git a/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh b/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
index a2784a6d16..eb12862bf4 100644
--- a/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
+++ b/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
@@ -94,6 +94,7 @@ static inline __device__ int int3ToShiftIndex(int3 iv)
 */
 template<bool returnShift>
 static __forceinline__ __device__ int
+// NOLINTNEXTLINE(google-runtime-references)
 pbcDxAiuc(const PbcAiuc& pbcAiuc, const float4 r1, const float4 r2, float3& dr)
 {
     dr.x = r1.x - r2.x;
-- 
2.22.0
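For reference, setPbcAiuc above flattens the lower-triangular box into per-row scalars plus inverse diagonal entries, so a kernel can apply the minimum-image shift row by row (z first, since the z row also contributes to y and x). A host-side sketch of that correction under the same layout assumptions; the struct and function names are illustrative, not GROMACS API:

    #include <cmath>

    struct BoxAiuc
    {
        float invBoxDiagZ, boxZX, boxZY, boxZZ;
        float invBoxDiagY, boxYX, boxYY;
        float invBoxDiagX, boxXX;
    };

    static void minimumImage(const BoxAiuc& box, float& dx, float& dy, float& dz)
    {
        const float shiftZ = std::rint(dz * box.invBoxDiagZ); // z row first
        dx -= shiftZ * box.boxZX;
        dy -= shiftZ * box.boxZY;
        dz -= shiftZ * box.boxZZ;
        const float shiftY = std::rint(dy * box.invBoxDiagY);
        dx -= shiftY * box.boxYX;
        dy -= shiftY * box.boxYY;
        const float shiftX = std::rint(dx * box.invBoxDiagX);
        dx -= shiftX * box.boxXX;
    }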