#
# -misc-no-recursion
# We have way too many functions and methods relying on recursion
+#
+# -readability-static-accessed-through-instance
+# The GPU code in particular uses this in many places.
+#
+# -performance-type-promotion-in-math-fn
+# Those warnings are in the GPU code and we shouldn't need to worry about them.
Checks: clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureAPI.strcpy,
bugprone-*,misc-*,readability-*,performance-*,mpi-*,
-readability-inconsistent-declaration-parameter-name,
-bugprone-narrowing-conversions,
-google-readability-avoid-underscore-in-googletest-name,
-cppcoreguidelines-init-variables,
- -misc-no-recursion
+ -misc-no-recursion,
+ -readability-static-accessed-through-instance,
+ -performance-type-promotion-in-math-fn
HeaderFilterRegex: .*
CheckOptions:
- key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
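For reference, readability-static-accessed-through-instance (disabled above) flags static members that are called through an object instance rather than through the class name. A minimal sketch of the pattern, with hypothetical names, that the GPU code hits in many places:

    struct KernelConfig
    {
        static int maxThreadsPerBlock() { return 256; }
    };

    int launchSize(const KernelConfig& config)
    {
        // Flagged: static member accessed through an instance; harmless, but
        // clang-tidy would suggest KernelConfig::maxThreadsPerBlock() instead.
        return config.maxThreadsPerBlock();
    }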
*gm_dataDest = *gm_dataSrc;
}
}
-
- return;
}
/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index
*gm_dataDest = *gm_dataSrc;
}
}
-
- return;
}
void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
-
- return;
}
void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
// Similarly send event to task that will push data to this task.
GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
recvRankX_,
0,
&remoteCoordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
sendRankX_,
0,
communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
-
- return;
}
// The following method should be called after non-local buffer operations,
haloDataTransferLaunched_->markEvent(nonLocalStream_);
MPI_Sendrecv(&haloDataTransferLaunched_,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
sendRank,
0,
&haloDataTransferRemote,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
recvRank,
0,
#if GMX_MPI
// Receive event from PP task
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]);
recvCount_++;
#else
#if GMX_MPI
// TODO Using MPI_Isend would be more efficient, particularly when
// sending to multiple PP ranks
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
#else
GMX_UNUSED_VALUE(ppRank);
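The NOLINT(bugprone-sizeof-expression) annotations above suppress a check that assumes sizeof applied to a pointer type is a typo for sizeof of the pointee. Here the pointer value itself is the message payload (with thread-MPI the ranks share an address space), so the expression is intended. A minimal sketch of the pattern, with hypothetical names:

    #include <mpi.h>

    // Exchange raw pointer values with neighbour ranks; only meaningful when
    // the peer can dereference them, e.g. in a shared address space.
    void exchangePointer(void* localPtr, void** remotePtr, int sendRank, int recvRank, MPI_Comm comm)
    {
        MPI_Sendrecv(&localPtr,
                     sizeof(localPtr), // sizeof(void*), intended here
                     MPI_BYTE,
                     sendRank,
                     0,
                     remotePtr,
                     sizeof(*remotePtr),
                     MPI_BYTE,
                     recvRank,
                     0,
                     comm,
                     MPI_STATUS_IGNORE);
    }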
* \param[in] ppRanks List of PP ranks
*/
Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ // NOLINTNEXTLINE(performance-trivially-destructible)
~Impl();
/*! \brief
case ZZ: return realGridSizeFP[ZZ];
}
assert(false);
- return 0.0f;
+ return 0.0F;
}
/*! \brief Reduce the partial force contributions.
const int splineIndex,
const int lineIndex,
const float* realGridSizeFP,
- float& fx,
- float& fy,
- float& fz)
+ float& fx, // NOLINT(google-runtime-references)
+ float& fy, // NOLINT(google-runtime-references)
+ float& fz) // NOLINT(google-runtime-references)
{
if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
{
if (dimIndex < DIM)
{
const float n = read_grid_size(realGridSizeFP, dimIndex);
- *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
+ float* __restrict__ sm_forcesAtomIndexOffset =
+ reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
+ sm_forcesAtomIndexOffset[dimIndex] = fx * n;
}
}
else
if (sourceIndex == minStride * atomIndex)
{
- *((float*)(&sm_forces[atomIndex]) + dimIndex) =
+ float* __restrict__ sm_forcesAtomIndexOffset =
+ reinterpret_cast<float*>(&sm_forces[atomIndex]);
+ sm_forcesAtomIndexOffset[dimIndex] =
(sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
}
}
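Both hunks replace C-style casts with reinterpret_cast while keeping the same indexing idiom: a float3 is three contiguous floats, so one thread per dimension (dimIndex in {XX, YY, ZZ}) can write its component without branching. A minimal sketch, assuming CUDA's float3 layout:

    __device__ void writeForceComponent(float3* sm_forces, int atomIndex, int dimIndex, float value)
    {
        // View the float3 as a flat array of its x, y, z components.
        float* components = reinterpret_cast<float*>(&sm_forces[atomIndex]);
        components[dimIndex] = value;
    }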
kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
__syncwarp();
}
- float fx = 0.0f;
- float fy = 0.0f;
- float fz = 0.0f;
+ float fx = 0.0F;
+ float fy = 0.0F;
+ float fz = 0.0F;
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);
{
int outputIndexLocal = i * iterThreads + threadLocalId;
int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
{
/* We must sync here since the same shared memory is used as above. */
__syncthreads();
- fx = 0.0f;
- fy = 0.0f;
- fz = 0.0f;
+ fx = 0.0F;
+ fy = 0.0F;
+ fz = 0.0F;
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
if (chargeCheck)
{
{
int outputIndexLocal = i * iterThreads + threadLocalId;
int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
gm_forces[outputIndexGlobal] += outputForceComponent;
}
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const int realGridSizePaddedTotal =
realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
- realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex];
+ realGrid_ = reinterpret_cast<cufftReal*>(kernelParamsPtr->grid.d_realGrid[gridIndex]);
GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
- complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex];
+ complexGrid_ = reinterpret_cast<cufftComplex*>(kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
cufftResult_t result;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* This is called from the spline_and_spread and gather PME kernels.
*/
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+bool __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
{
assert(isfinite(coefficient));
- return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+ return c_skipNeutralAtoms ? (coefficient != 0.0F) : true;
}
//! Controls if the atom and charge data is prefetched into shared memory or loaded per thread from global
template<>
__device__ inline void assertIsFinite(float3 gmx_unused arg)
{
- assert(isfinite(float(arg.x)));
- assert(isfinite(float(arg.y)));
- assert(isfinite(float(arg.z)));
+ assert(isfinite(static_cast<float>(arg.x)));
+ assert(isfinite(static_cast<float>(arg.y)));
+ assert(isfinite(static_cast<float>(arg.z)));
}
template<typename T>
__device__ inline void assertIsFinite(T gmx_unused arg)
{
- assert(isfinite(float(arg)));
+ assert(isfinite(static_cast<float>(arg)));
}
/*! \brief
const float shift = c_pmeMaxUnitcellShift;
/* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
t = (t + shift) * n;
- tInt = (int)t;
+ tInt = static_cast<int>(t);
assert(sharedMemoryIndex < atomsPerBlock * DIM);
sm_fractCoords[sharedMemoryIndex] = t - tInt;
tableIndex += tInt;
assert(isfinite(dr));
/* dr is relative offset from lower cell limit */
- splineData[order - 1] = 0.0f;
+ splineData[order - 1] = 0.0F;
splineData[1] = dr;
- splineData[0] = 1.0f - dr;
+ splineData[0] = 1.0F - dr;
#pragma unroll
for (int k = 3; k < order; k++)
{
- div = 1.0f / (k - 1.0f);
+ div = 1.0F / (k - 1.0F);
splineData[k - 1] = div * dr * splineData[k - 2];
#pragma unroll
for (int l = 1; l < (k - 1); l++)
splineData[k - l - 1] =
div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]);
}
- splineData[0] = div * (1.0f - dr) * splineData[0];
+ splineData[0] = div * (1.0F - dr) * splineData[0];
}
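For reference, the loop above implements the standard cardinal B-spline recurrence used in smooth PME (Essmann et al., 1995), with dr the fractional coordinate offset and div = 1/(k - 1):

    M_k(u) = \frac{u\,M_{k-1}(u) + (k - u)\,M_{k-1}(u - 1)}{k - 1}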
const int thetaIndexBase =
const int thetaIndex =
getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
- const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
+ const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o];
assert(isfinite(dtheta));
assert(thetaIndex < order * DIM * atomsPerBlock);
if (writeSmDtheta)
}
}
- div = 1.0f / (order - 1.0f);
+ div = 1.0F / (order - 1.0F);
splineData[order - 1] = div * dr * splineData[order - 2];
#pragma unroll
for (int k = 1; k < (order - 1); k++)
* ((dr + k) * splineData[order - k - 2]
+ (order - k - dr) * splineData[order - k - 1]);
}
- splineData[0] = div * (1.0f - dr) * splineData[0];
+ splineData[0] = div * (1.0F - dr) * splineData[0];
/* Storing the spline values (theta) */
#pragma unroll
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU_CUDA
destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
- kernelParamsPtr->fractShiftsTableTexture);
+ &kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
- kernelParamsPtr->gridlineIndicesTableTexture);
+ &kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int mode, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
-__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_spline_and_spread_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template<GridOrdering gridOrdering, bool computeEnergyAndVirial, const int gridIndex> /* It is significantly slower to pass gridIndex as a kernel parameter */
-__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_solve_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format on
template<int order, bool wrapX, bool wrapY, int nGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
-__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_gather_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// Reallocate buffer used for staging PME force on GPU
reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
- return;
}
void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu)
// Receive event from PME task and add to stream, to ensure pull of data doesn't
// occur before PME force calc is completed
GpuEventSynchronizer* pmeSync;
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
pmeSync->enqueueWaitEvent(pmePpCommStream_);
#endif
// Record and send event to allow PME task to sync to above transfer before commencing force calculations
pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
#endif
}
int sendSize,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
-private:
//! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
//! Handle for CUDA stream used for the communication operations in this class
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex]
+ kernelParams.grid.splineValuesOffset[minorDim];
float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex];
- float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex];
+ float2* __restrict__ gm_grid = reinterpret_cast<float2*>(kernelParams.grid.d_fourierGrid[gridIndex]);
/* Various grid sizes and indices */
const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
const int indexMajor = blockIdx.z;
/* Optional outputs */
- float energy = 0.0f;
- float virxx = 0.0f;
- float virxy = 0.0f;
- float virxz = 0.0f;
- float viryy = 0.0f;
- float viryz = 0.0f;
- float virzz = 0.0f;
+ float energy = 0.0F;
+ float virxx = 0.0F;
+ float virxy = 0.0F;
+ float virxz = 0.0F;
+ float viryy = 0.0F;
+ float viryz = 0.0F;
+ float virzz = 0.0F;
assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
}
/* 0.5 correction factor for the first and last components of a Z dimension */
- float corner_fac = 1.0f;
+ float corner_fac = 1.0F;
switch (gridOrdering)
{
case GridOrdering::YZX:
if ((kMiddle == 0) | (kMiddle == maxkMiddle))
{
- corner_fac = 0.5f;
+ corner_fac = 0.5F;
}
break;
case GridOrdering::XYZ:
if ((kMinor == 0) | (kMinor == maxkMinor))
{
- corner_fac = 0.5f;
+ corner_fac = 0.5F;
}
break;
+ mZ * kernelParams.current.recipBox[ZZ][ZZ];
const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
- assert(m2k != 0.0f);
+ assert(m2k != 0.0F);
// TODO: use LDG/textures for gm_splineValue
float denom = m2k * float(CUDART_PI_F) * kernelParams.current.boxVolume
* gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle]
* gm_splineValueMinor[kMinor];
assert(isfinite(denom));
- assert(denom != 0.0f);
+ assert(denom != 0.0F);
const float tmp1 = expf(-kernelParams.grid.ewaldFactor * m2k);
const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
if (computeEnergyAndVirial)
{
const float tmp1k =
- 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+ 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
- float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
+ float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
float ets2 = corner_fac * tmp1k;
energy = ets2;
/* Reduce 7 outputs per warp in the shared memory */
const int stride =
8; // this is c_virialAndEnergyCount==7 rounded up to power of 2 for convenience, hence the assert
- assert(c_virialAndEnergyCount == 7);
+ static_assert(c_virialAndEnergyCount == 7);
const int reductionBufferSize = (c_solveMaxThreadsPerBlock / warp_size) * stride;
__shared__ float sm_virialAndEnergy[reductionBufferSize];
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static __forceinline__ __device__ T fetchFromTexture(const cudaTextureObject_t texObj, int index)
{
assert(index >= 0);
+ // NOLINTNEXTLINE(misc-static-assert)
assert(!c_disableCudaTextures);
return tex1Dfetch<T>(texObj, index);
}
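The NOLINT suppresses misc-static-assert, which suggests static_assert because the condition is a compile-time constant. Converting it would fail to compile configurations that disable textures, even though this fetch path is never called there; a runtime assert only fires if the code is actually reached. A minimal sketch with a hypothetical flag:

    #include <cassert>

    constexpr bool c_featureDisabled = false; // hypothetical build-time flag

    int fetchValue(int index)
    {
        // static_assert(!c_featureDisabled) would break builds that set the
        // flag but never call this function; assert only fires when reached.
        assert(!c_featureDisabled);
        return index;
    }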
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess.
*/
-static inline std::string getDeviceErrorString(const cudaError_t deviceError)
+inline std::string getDeviceErrorString(const cudaError_t deviceError)
{
return formatString("CUDA error #%d (%s): %s.",
deviceError,
*
* \throws InternalError if deviceError is not a success.
*/
-static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
+inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
{
if (deviceError != cudaSuccess)
{
*
* \param[in] errorMessage Undecorated error message.
*/
-static inline void ensureNoPendingDeviceError(const std::string& errorMessage)
+inline void ensureNoPendingDeviceError(const std::string& errorMessage)
{
// Ensure there is no pending error that would otherwise affect
// the behaviour of future error handling.
#ifdef CHECK_CUDA_ERRORS
/*! Check for CUDA error on the return status of a CUDA RT API call. */
-# define CU_RET_ERR(deviceError, msg) \
- do \
- { \
- if (deviceError != cudaSuccess) \
- { \
- gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \
- } \
+# define CU_RET_ERR(deviceError, msg) \
+ do \
+ { \
+ if ((deviceError) != cudaSuccess) \
+ { \
+ gmx_fatal(FARGS, "%s\n", ((msg) + gmx::getDeviceErrorString(deviceError)).c_str()); \
+ } \
} while (0)
#else /* CHECK_CUDA_ERRORS */
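Parenthesizing deviceError and msg in the expansion guards against argument expressions whose low-precedence operators would otherwise parse unexpectedly. A minimal sketch of the hazard, with hypothetical macros:

    #define SQUARE_BAD(x) x * x
    #define SQUARE_GOOD(x) ((x) * (x))

    int demo()
    {
        int a = SQUARE_BAD(1 + 2);  // expands to 1 + 2 * 1 + 2 == 5
        int b = SQUARE_GOOD(1 + 2); // expands to ((1 + 2) * (1 + 2)) == 9
        return a + b;
    }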
const CurrentArg* argPtr,
const RemainingArgs*... otherArgsPtrs)
{
+ // NOLINTNEXTLINE(google-readability-casting)
(*kernelArgsPtr)[argIndex] = (void*)argPtr;
prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs...);
}
{
dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
+ // NOLINTNEXTLINE(google-readability-casting)
cudaLaunchKernel((void*)kernel,
gridSize,
blockSize,
void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& /* deviceContext */)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
- cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
+ // NOLINTNEXTLINE(google-readability-casting)
+ cudaError_t stat = cudaMalloc((void**)(buffer), numValues * sizeof(ValueType));
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
{
case GpuApiCallBehavior::Async:
GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
+ // NOLINTNEXTLINE(google-readability-casting)
stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset,
hostBuffer,
bytes,
case GpuApiCallBehavior::Sync:
stat = cudaMemcpy(
- *((ValueType**)buffer) + startingOffset, hostBuffer, bytes, cudaMemcpyHostToDevice);
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ hostBuffer,
+ bytes,
+ cudaMemcpyHostToDevice);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
GMX_ASSERT(isHostMemoryPinned(hostBuffer),
"Destination host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(hostBuffer,
+ // NOLINTNEXTLINE(google-readability-casting)
*((ValueType**)buffer) + startingOffset,
bytes,
cudaMemcpyDeviceToHost,
break;
case GpuApiCallBehavior::Sync:
- stat = cudaMemcpy(
- hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost);
+ stat = cudaMemcpy(hostBuffer,
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ bytes,
+ cudaMemcpyDeviceToHost);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Synchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
const char pattern = 0;
cudaError_t stat = cudaMemsetAsync(
- *((ValueType**)buffer) + startingOffset, pattern, bytes, deviceStream.stream());
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ pattern,
+ bytes,
+ deviceStream.stream());
GMX_RELEASE_ASSERT(stat == cudaSuccess,
("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str());
}
const size_t sizeInBytes = numValues * sizeof(ValueType);
cudaError_t stat =
+ // NOLINTNEXTLINE(google-readability-casting)
cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
GMX_RELEASE_ASSERT(stat == cudaSuccess,
* \param[in,out] deviceTexture Device texture object to unbind.
*/
template<typename ValueType>
-void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& deviceTexture)
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, const DeviceTexture* deviceTexture)
{
if (!c_disableCudaTextures && deviceTexture && deviceBuffer)
{
- cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
+ cudaError_t stat = cudaDestroyTextureObject(*deviceTexture);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
* \param[in,out] deviceBuffer Device buffer to store data in.
*/
template<typename ValueType>
-void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& /* deviceTexture */)
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture* /* deviceTexture */)
{
deviceBuffer->buffer_.reset(nullptr);
}
#include "gromacs/utility/snprintf.h"
#include "gromacs/utility/stringutil.h"
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
bool isHostMemoryPinned(const void* h_ptr)
return isPinned;
}
-void startGpuProfiler(void)
+void startGpuProfiler()
{
/* The NVPROF_ID environment variable is set by nvprof and indicates that
mdrun is executed in the CUDA profiler.
}
}
-void stopGpuProfiler(void)
+void stopGpuProfiler()
{
/* Stopping the nvidia profiler here allows us to eliminate the subsequent
API calls from the trace, e.g. uninitialization and cleanup. */
}
}
-void resetGpuProfiler(void)
+void resetGpuProfiler()
{
/* With CUDA <=7.5 the profiler can't be properly reset; we can only start
* the profiling here (can't stop it) which will achieve the desired effect if
{
std::string errorString =
gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
- CU_RET_ERR(stat, errorString.c_str());
+ CU_RET_ERR(stat, errorString);
}
if (stat != cudaSuccess)
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* for passing into individual GPU API calls.
* This is just a dummy in CUDA.
*/
- inline CommandEvent* fetchNextEvent() { return nullptr; }
+ static inline CommandEvent* fetchNextEvent() { return nullptr; }
};
//! Short-hand for external use
gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory.");
stat = cudaMallocHost(h_ptr, nbytes, flag);
- sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
+ sprintf(strbuf, "cudaMallocHost of size %d bytes failed", static_cast<int>(nbytes));
CU_RET_ERR(stat, strbuf);
}
EXPECT_TRUE(input.empty());
resizeAndFillInput(&input, 3, 1);
// realloc and copy).
- auto oldInputData = input.data();
+ auto* oldInputData = input.data();
changePinningPolicy(&input, PinningPolicy::CannotBePinned);
EXPECT_FALSE(isPinned(input));
// These cannot be equal as both had to be allocated at the same
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <vector>
+#include "gromacs/utility/arrayref.h"
+
#include "testutils/testasserts.h"
#if !GMX_GPU_CUDA
namespace test
{
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& /* rVecOutput */,
- const std::vector<gmx::RVec>& /* rVecInput */)
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> /* rVecOutput */,
+ ArrayRef<const gmx::RVec> /* rVecInput */)
{
FAIL() << "Can't test float3 and RVec compatibility without CUDA.";
}
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& /* rVecOutput */,
- const std::vector<gmx::RVec>& /* rVecInput */,
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> /* rVecOutput */,
+ ArrayRef<const gmx::RVec> /* rVecInput */,
const TestDevice* /* testDevice */)
{
FAIL() << "Can't test float3 and RVec compatibility without CUDA.";
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/hardware/device_information.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/stringutil.h"
* \param[in] float3Output Output data in float3 format.
* \param[in] numElements Size of the data buffers.
*/
-void inline saveFloat3InRVecFormat(std::vector<gmx::RVec>& rVecOutput, const float3* float3Output, int numElements)
+void inline saveFloat3InRVecFormat(ArrayRef<gmx::RVec> rVecOutput, const float3* float3Output, int numElements)
{
for (int i = 0; i < numElements; i++)
{
}
}
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput)
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput)
{
const int numElements = rVecInput.size();
}
}
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput,
- const std::vector<gmx::RVec>& h_rVecInput,
- const TestDevice* testDevice)
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> h_rVecOutput,
+ ArrayRef<const gmx::RVec> h_rVecInput,
+ const TestDevice* testDevice)
{
const DeviceContext& deviceContext = testDevice->deviceContext();
const DeviceStream& deviceStream = testDevice->deviceStream();
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
namespace gmx
{
+template<typename>
+class ArrayRef;
+
namespace test
{
* \param[out] rVecOutput Data in RVec format for the output.
* \param[in] rVecInput Data in RVec format with the input.
*/
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput);
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput);
/*! \brief Tests the compatibility of RVec and float3 using the conversion on device.
*
* \param[in] rVecInput Data in RVec format with the input.
* \param[in]  testDevice   Test hardware environment to get DeviceContext and DeviceStream from.
*/
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& rVecOutput,
- const std::vector<gmx::RVec>& rVecInput,
- const TestDevice* testDevice);
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> rVecOutput,
+ ArrayRef<const gmx::RVec> rVecInput,
+ const TestDevice* testDevice);
} // namespace test
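Passing ArrayRef by value instead of std::vector by reference lets these helpers accept any contiguous range without committing to a container. A minimal usage sketch, assuming gmx::ArrayRef's implicit conversion from std::vector:

    void roundTripExample()
    {
        std::vector<gmx::RVec> input(3, { 1.0F, 2.0F, 3.0F });
        std::vector<gmx::RVec> output(input.size());
        // Vectors convert implicitly: non-const to ArrayRef<gmx::RVec>,
        // const to ArrayRef<const gmx::RVec>.
        convertRVecToFloat3OnHost(output, input);
    }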
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2015,2016,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
return make_float3(a.x, a.y, a.z);
}
-__forceinline__ __host__ __device__ float3 operator-(float3& a)
+__forceinline__ __host__ __device__ float3 operator-(const float3& a)
{
return make_float3(-a.x, -a.y, -a.z);
}
{
return make_float3(k * a.x, k * a.y, k * a.z);
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float3& a, float3 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float3& a, float4 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator-=(float3& a, float3 b)
{
a.x -= b.x;
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator*=(float3& a, float3 b)
{
a.x *= b.x;
a.y *= b.y;
a.z *= b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator*=(float3& a, float b)
{
a.x *= b;
}
__forceinline__ __host__ __device__ float4 make_float4(float3 a)
{
- return make_float4(a.x, a.y, a.z, 0.0f);
+ return make_float4(a.x, a.y, a.z, 0.0F);
}
__forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
{
a.z += b.z;
a.w += b.w;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float4& a, float3 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator-=(float4& a, float3 b)
{
a.x -= b.x;
float ipb = norm2(b);
float ip = iprod(a, b);
float ipab = ipa * ipb;
- if (ipab > 0.0f)
+ if (ipab > 0.0F)
{
cosval = ip * rsqrt(ipab);
}
else
{
- cosval = 1.0f;
+ cosval = 1.0F;
}
- if (cosval > 1.0f)
+ if (cosval > 1.0F)
{
- return 1.0f;
+ return 1.0F;
}
- if (cosval < -1.0f)
+ if (cosval < -1.0F)
{
- return -1.0f;
+ return -1.0F;
}
return cosval;
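For reference, the function computes the cosine of the angle between a and b as

    \cos\theta = \frac{\mathbf{a}\cdot\mathbf{b}}{\sqrt{(\mathbf{a}\cdot\mathbf{a})\,(\mathbf{b}\cdot\mathbf{b})}}

clamped to [-1, 1] so that floating-point rounding cannot push the result outside the domain of acos.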
*
* \param[in] a First vector.
* \param[in] b Second vector.
- * \returns Angle between vectors.
*/
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __device__ void atomicAdd(float3& a, const float3 b)
{
atomicAdd(&a.x, b.x);
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
*/
-static int c_cudaMaxDeviceCount = 32;
+static const int c_cudaMaxDeviceCount = 32;
/** Dummy kernel used for sanity checking. */
-static __global__ void dummy_kernel(void) {}
+static __global__ void dummy_kernel() {}
static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
{
if (stat != cudaSuccess)
{
auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
- CU_RET_ERR(stat, message.c_str());
+ CU_RET_ERR(stat, message);
}
if (debug)
int numAtomsPerInteraction,
ArrayRef<const int> nbnxnAtomOrder)
{
- GMX_ASSERT(src.size() == 0 || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");
+ GMX_ASSERT(src.empty() || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");
dest->iatoms.resize(src.size());
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
public:
//! Constructor
Impl(const gmx_ffparams_t& ffparams,
- const float electrostaticsScaleFactor,
+ float electrostaticsScaleFactor,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
gmx_wallcycle* wcycle);
/*-------------------------------- CUDA kernels-------------------------------- */
/*------------------------------------------------------------------------------*/
-#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
+#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0F)
/*---------------- BONDED CUDA kernels--------------*/
__device__ __forceinline__ static void
harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
{
- constexpr float half = 0.5f;
+ constexpr float half = 0.5F;
float dx, dx2;
dx = x - xA;
{
if (i < numBonds)
{
- int3 bondData = *(int3*)(d_forceatoms + 3 * i);
- int type = bondData.x;
- int ai = bondData.y;
- int aj = bondData.z;
+ const int3 bondData = *(reinterpret_cast<const int3*>(d_forceatoms + 3 * i));
+ int type = bondData.x;
+ int ai = bondData.y;
+ int aj = bondData.z;
/* dx = xi - xj, corrected for periodic boundary conditions. */
float3 dx;
*vtot_loc += vbond;
}
- if (dr2 != 0.0f)
+ if (dr2 != 0.0F)
{
fbond *= rsqrtf(dr2);
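For reference, harmonic_gpu evaluates the harmonic bond potential and the magnitude of its derivative,

    V(r) = \tfrac{1}{2}\,k\,(r - r_0)^2, \qquad F = -k\,(r - r_0),

and the rsqrtf(dr2) factor converts F into F/r so that the force vector can be formed directly as (F/r) * dx.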
{
if (i < numBonds)
{
- int4 angleData = *(int4*)(d_forceatoms + 4 * i);
- int type = angleData.x;
- int ai = angleData.y;
- int aj = angleData.z;
- int ak = angleData.w;
+ const int4 angleData = *(reinterpret_cast<const int4*>(d_forceatoms + 4 * i));
+ int type = angleData.x;
+ int ai = angleData.y;
+ int aj = angleData.z;
+ int ak = angleData.w;
float3 r_ij;
float3 r_kj;
}
float cos_theta2 = cos_theta * cos_theta;
- if (cos_theta2 < 1.0f)
+ if (cos_theta2 < 1.0F)
{
- float st = dVdt * rsqrtf(1.0f - cos_theta2);
+ float st = dVdt * rsqrtf(1.0F - cos_theta2);
float sth = st * cos_theta;
float nrij2 = norm2(r_ij);
float nrkj2 = norm2(r_kj);
{
if (i < numBonds)
{
- int4 ubData = *(int4*)(d_forceatoms + 4 * i);
- int type = ubData.x;
- int ai = ubData.y;
- int aj = ubData.z;
- int ak = ubData.w;
+ const int4 ubData = *(reinterpret_cast<const int4*>(d_forceatoms + 4 * i));
+ int type = ubData.x;
+ int ai = ubData.y;
+ int aj = ubData.z;
+ int ak = ubData.w;
float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
float kthA = d_forceparams[type].u_b.kthetaA;
harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
float cos_theta2 = cos_theta * cos_theta;
- if (cos_theta2 < 1.0f)
+ if (cos_theta2 < 1.0F)
{
- float st = dVdt * rsqrtf(1.0f - cos_theta2);
+ float st = dVdt * rsqrtf(1.0F - cos_theta2);
float sth = st * cos_theta;
float nrkj2 = norm2(r_kj);
}
/* Time for the bond calculations */
- if (dr2 != 0.0f)
+ if (dr2 != 0.0F)
{
if (calcEner)
{
*n = cprod(*r_kj, *r_kl);
float phi = gmx_angle(*m, *n);
float ipr = iprod(*r_ij, *n);
- float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
+ float sign = (ipr < 0.0F) ? -1.0F : 1.0F;
phi = sign * phi;
return phi;
mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
sdphi = sinf(mdphi);
- *v = cpA * (1.0f + cosf(mdphi));
+ *v = cpA * (1.0F + cosf(mdphi));
*f = -cpA * mult * sdphi;
}
float3 sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
+ constexpr float c0 = 0.0F, c1 = 1.0F, c2 = 2.0F, c3 = 3.0F, c4 = 4.0F, c5 = 5.0F;
if (i < numBonds)
{
/* dp cannot be outside (-pi,pi) */
if (*dp >= CUDART_PI_F)
{
- *dp -= 2.0f * CUDART_PI_F;
+ *dp -= 2.0F * CUDART_PI_F;
}
else if (*dp < -CUDART_PI_F)
{
- *dp += 2.0f * CUDART_PI_F;
+ *dp += 2.0F * CUDART_PI_F;
}
}
if (calcEner)
{
- *vtot_loc += -0.5f * ddphi * dp;
+ *vtot_loc += -0.5F * ddphi * dp;
}
}
}
if (i < numBonds)
{
// TODO this should be made into a separate type; the GPU and CPU sizes should be compared
- int3 pairData = *(int3*)(d_forceatoms + 3 * i);
- int type = pairData.x;
- int ai = pairData.y;
- int aj = pairData.z;
+ const int3 pairData = *(reinterpret_cast<const int3*>(d_forceatoms + 3 * i));
+ int type = pairData.x;
+ int ai = pairData.y;
+ int aj = pairData.z;
float qq = gm_xq[ai].w * gm_xq[aj].w;
float c6 = iparams[type].lj14.c6A;
float velec = scale_factor * qq * rinv;
/* Calculate the LJ force * r and add it to the Coulomb part */
- float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
+ float fr = (12.0F * c12 * rinv6 - 6.0F * c6) * rinv6 + velec;
float finvr = fr * rinv2;
float3 f = finvr * dr;
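For reference, with rinv = 1/r, the expression is the 12-6 Lennard-Jones force times r plus the scaled Coulomb term (f denoting the electric conversion factor folded into scale_factor),

    -r\,\frac{dV}{dr} = \frac{12\,c_{12}}{r^{12}} - \frac{6\,c_6}{r^6} + \frac{f\,q_i q_j}{r},

which the subsequent multiplication by rinv2 turns into the force per unit displacement along dr.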
extern __shared__ char sm_dynamicShmem[];
char* sm_nextSlotPtr = sm_dynamicShmem;
- float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr;
+ float3* sm_fShiftLoc = reinterpret_cast<float3*>(sm_nextSlotPtr);
sm_nextSlotPtr += c_numShiftVectors * sizeof(float3);
if (calcVir)
{
if (threadIdx.x < c_numShiftVectors)
{
- sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
+ sm_fShiftLoc[threadIdx.x] = make_float3(0.0F, 0.0F, 0.0F);
}
__syncthreads();
}
int warpId = threadIdx.x / warpSize;
// Shared memory variables to hold block-local partial sum
- float* sm_vTot = (float*)sm_nextSlotPtr;
+ float* sm_vTot = reinterpret_cast<float*>(sm_nextSlotPtr);
sm_nextSlotPtr += numWarps * sizeof(float);
- float* sm_vTotVdw = (float*)sm_nextSlotPtr;
+ float* sm_vTotVdw = reinterpret_cast<float*>(sm_nextSlotPtr);
sm_nextSlotPtr += numWarps * sizeof(float);
- float* sm_vTotElec = (float*)sm_nextSlotPtr;
+ float* sm_vTotElec = reinterpret_cast<float*>(sm_nextSlotPtr);
if (threadIdx.x % warpSize == 0)
{
baseForce_ = baseForcePtr;
numAtoms_ = numAtoms;
atomStart_ = atomStart;
- accumulate_ = static_cast<int>(accumulate);
+ accumulate_ = accumulate;
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
rvecForceToAdd_ = forcePtr;
};
-void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency)
{
dependencyList_.push_back(dependency);
}
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-GpuForceReduction::Impl::~Impl() = default;
-
GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
gmx_wallcycle* wcycle) :
impl_->registerRvecForce(forcePtr);
}
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency)
{
impl_->addDependency(dependency);
}
* \param [in] wcycle The wallclock counter
*/
Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
- ~Impl();
+ ~Impl() = default;
/*! \brief Register a nbnxm-format force to be reduced
*
* \param [in] forcePtr Pointer to force to be reduced
*
* \param [in] dependency Dependency for this reduction
*/
- void addDependency(GpuEventSynchronizer* const dependency);
+ void addDependency(GpuEventSynchronizer* dependency);
/*! \brief Reinitialize the GPU force reduction
*
* \param [in] completionMarker Event to be marked when launch of reduction is complete
*/
void reinit(DeviceBuffer<Float3> baseForcePtr,
- const int numAtoms,
+ int numAtoms,
ArrayRef<const int> cell,
- const int atomStart,
- const bool accumulate,
+ int atomStart,
+ bool accumulate,
GpuEventSynchronizer* completionMarker = nullptr);
/*! \brief Execute the force reduction */
//! number of atoms
int numAtoms_ = 0;
//! whether reduction is accumulated into base force buffer
- int accumulate_ = true;
+ bool accumulate_ = true;
//! cell information for any nbat-format forces
struct cellInfo cellInfo_;
//! GPU context object
*gm_fDest = temp;
}
- return;
}
void launchForceReductionKernel(int numAtoms,
void integrate(DeviceBuffer<Float3> d_x,
DeviceBuffer<Float3> d_xp,
DeviceBuffer<Float3> d_v,
- const DeviceBuffer<Float3> d_f,
- const float dt,
- const bool doTemperatureScaling,
+ DeviceBuffer<Float3> d_f,
+ float dt,
+ bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doParrinelloRahman,
- const float dtPressureCouple,
+ bool doParrinelloRahman,
+ float dtPressureCouple,
const matrix prVelocityScalingMatrix);
/*! \brief Set the integrator
* \param[in] inverseMasses Inverse masses of atoms.
* \param[in] tempScaleGroups Maps the atom index to temperature scale value.
*/
- void set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
+ void set(int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
/*! \brief Class with hardware-specific interfaces and implementations.*/
class Impl;
namespace gmx
{
-void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc)
+void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc)
{
GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
virialScaled[ZZ][YY] += h_virialScaled_[4];
virialScaled[ZZ][ZZ] += h_virialScaled_[5];
}
-
- return;
}
LincsGpu::LincsGpu(int numIterations,
int center = c1a1;
- float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
- float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+ float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+ float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
int center = c1a2;
- float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
- float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+ float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+ float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;
* \param[in,out] virialScaled Scaled virial tensor to be updated.
* \param[in] pbcAiuc PBC data.
*/
- void apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc);
+ void apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc);
/*! \brief
* Update data-structures (e.g. after NB search step).
const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled;
- int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+ const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
// numConstraintsThreads should be an integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
// This is to ensure proper synchronization and reduction. All arrays are padded to the required size.
int j = pair.j;
// Mass-scaled Lagrange multiplier
- float lagrangeScaled = 0.0f;
+ float lagrangeScaled = 0.0F;
float targetLength;
float inverseMassi;
// Everything computed for these dummies will be equal to zero
if (isDummyThread)
{
- targetLength = 0.0f;
- inverseMassi = 0.0f;
- inverseMassj = 0.0f;
- sqrtReducedMass = 0.0f;
-
- xi = make_float3(0.0f, 0.0f, 0.0f);
- xj = make_float3(0.0f, 0.0f, 0.0f);
- rc = make_float3(0.0f, 0.0f, 0.0f);
+ targetLength = 0.0F;
+ inverseMassi = 0.0F;
+ inverseMassj = 0.0F;
+ sqrtReducedMass = 0.0F;
+
+ xi = make_float3(0.0F, 0.0F, 0.0F);
+ xj = make_float3(0.0F, 0.0F, 0.0F);
+ rc = make_float3(0.0F, 0.0F, 0.0F);
}
else
{
{
// Making sure that all sm_rhs are saved before they are accessed in a loop below
__syncthreads();
- float mvb = 0.0f;
+ float mvb = 0.0F;
for (int n = 0; n < coupledConstraintsCount; n++)
{
float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
float len2 = targetLength * targetLength;
- float dlen2 = 2.0f * len2 - norm2(dx);
+ float dlen2 = 2.0F * len2 - norm2(dx);
// TODO A slightly more efficient but less readable version of the below would be:
// float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
float proj;
- if (dlen2 > 0.0f)
+ if (dlen2 > 0.0F)
{
proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
}
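For reference, this is the rotational-lengthening correction of the LINCS algorithm (Hess et al., 1997): after the first projection, a constraint with target length d may have stretched to length l, and the correction is proportional to

    d - p, \qquad p = \sqrt{2\,d^2 - l^2},

which the code evaluates as targetLength - dlen2 * rsqrt(dlen2) with dlen2 = 2*len2 - norm2(dx).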
atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
}
}
-
- return;
}
/*! \brief Select templated kernel.
return kernelPtr;
}
-void launchLincsGpuKernel(LincsGpuKernelParameters& kernelParams,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- const DeviceStream& deviceStream)
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ const DeviceStream& deviceStream)
{
auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
nullptr,
"lincs_kernel<updateVelocities, computeVirial>",
kernelArgs);
-
- return;
}
} // namespace gmx
//! Number of threads in a GPU block
constexpr static int c_threadsPerBlock = 256;
-void launchLincsGpuKernel(LincsGpuKernelParameters& kernelParams,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- const DeviceStream& deviceStream);
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ const DeviceStream& deviceStream);
} // namespace gmx
namespace gmx
{
-void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */,
- const DeviceBuffer<Float3> /* d_x */,
+void launchLincsGpuKernel(const LincsGpuKernelParameters& /* kernelParams */,
+ const DeviceBuffer<Float3>& /* d_x */,
DeviceBuffer<Float3> /* d_xp */,
const bool /* updateVelocities */,
DeviceBuffer<Float3> /* d_v */,
namespace gmx
{
-void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc)
+void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc)
{
// Early exit if no settles
virialScaled[ZZ][YY] += h_virialScaled_[4];
virialScaled[ZZ][ZZ] += h_virialScaled_[5];
}
-
- return;
}
SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
* \param[in,out] virialScaled Scaled virial tensor to be updated.
* \param[in] pbcAiuc PBC data.
*/
- void apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc);
+ void apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc);
/*! \brief
* Update data-structures (e.g. after NB search step).
float sinphi = a1d_z * rsqrt(pars.ra * pars.ra);
- float tmp2 = 1.0f - sinphi * sinphi;
+ float tmp2 = 1.0F - sinphi * sinphi;
if (almost_zero > tmp2)
{
float tmp = rsqrt(tmp2);
float cosphi = tmp2 * tmp;
float sinpsi = (b1d.z - c1d.z) * pars.irc2 * tmp;
- tmp2 = 1.0f - sinpsi * sinpsi;
+ tmp2 = 1.0F - sinpsi * sinpsi;
float cospsi = tmp2 * rsqrt(tmp2);
float sinthe = (alpha * gamma - beta * tmp2 * rsqrt(tmp2)) * rsqrt(al2be2 * al2be2);
/* --- Step4 A3' --- */
- tmp2 = 1.0f - sinthe * sinthe;
+ tmp2 = 1.0F - sinthe * sinthe;
float costhe = tmp2 * rsqrt(tmp2);
float3 a3d, b3d, c3d;
{
for (int d = 0; d < 6; d++)
{
- sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0f;
+ sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0F;
}
}
}
atomicAdd(&(gm_virialScaled[tib]), sm_threadVirial[tib * blockSize]);
}
}
-
- return;
}
/*! \brief Select templated kernel.
return kernelPtr;
}
-void launchSettleGpuKernel(const int numSettles,
- const DeviceBuffer<WaterMolecule> d_atomIds,
- const SettleParameters settleParameters,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- DeviceBuffer<float> virialScaled,
- const PbcAiuc pbcAiuc,
- const DeviceStream& deviceStream)
+void launchSettleGpuKernel(const int numSettles,
+ const DeviceBuffer<WaterMolecule>& d_atomIds,
+ const SettleParameters& settleParameters,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ DeviceBuffer<float> virialScaled,
+ const PbcAiuc& pbcAiuc,
+ const DeviceStream& deviceStream)
{
static_assert(
gmx::isPowerOfTwo(sc_threadsPerBlock),
nullptr,
"settle_kernel<updateVelocities, computeVirial>",
kernelArgs);
-
- return;
}
} // namespace gmx
* \param[in] pbcAiuc PBC data.
* \param[in] deviceStream Device stream to launch kernel in.
*/
-void launchSettleGpuKernel(int numSettles,
- const DeviceBuffer<WaterMolecule> d_atomIds,
- const SettleParameters settleParameters,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- DeviceBuffer<float> virialScaled,
- const PbcAiuc pbcAiuc,
- const DeviceStream& deviceStream);
+void launchSettleGpuKernel(int numSettles,
+ const DeviceBuffer<WaterMolecule>& d_atomIds,
+ const SettleParameters& settleParameters,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ DeviceBuffer<float> virialScaled,
+ const PbcAiuc& pbcAiuc,
+ const DeviceStream& deviceStream);
} // namespace gmx
{
void launchSettleGpuKernel(const int /* numSettles */,
- const DeviceBuffer<WaterMolecule> /* d_atomIds */,
- const SettleParameters /* settleParameters */,
- const DeviceBuffer<Float3> /* d_x */,
+ const DeviceBuffer<WaterMolecule>& /* d_atomIds */,
+ const SettleParameters& /* settleParameters */,
+ const DeviceBuffer<Float3>& /* d_x */,
DeviceBuffer<Float3> /* d_xp */,
const bool /* updateVelocities */,
DeviceBuffer<Float3> /* d_v */,
const real /* invdt */,
const bool /* computeVirial */,
DeviceBuffer<float> /* virialScaled */,
- const PbcAiuc /* pbcAiuc */,
+ const PbcAiuc& /* pbcAiuc */,
const DeviceStream& /* deviceStream */)
{
// SYCL_TODO
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
-static constexpr bool sc_haveGpuConstraintSupport = (GMX_GPU_CUDA);
+static constexpr bool sc_haveGpuConstraintSupport = GMX_GPU_CUDA;
namespace gmx
{
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
- return;
}
void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
*/
void set(DeviceBuffer<Float3> d_x,
DeviceBuffer<Float3> d_v,
- const DeviceBuffer<Float3> d_f,
+ DeviceBuffer<Float3> d_f,
const InteractionDefinitions& idef,
const t_mdatoms& md);
void launchScaleCoordinatesKernel(const int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream)
{
KernelLaunchConfig kernelLaunchConfig;
*/
void launchScaleCoordinatesKernel(int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream);
} // namespace gmx
void launchScaleCoordinatesKernel(const int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream)
{
const cl::sycl::range<1> rangeAllAtoms(numAtoms);
*
* \returns Tuple containing the index of the first atom in the range and the total number of atoms in the range.
*/
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
/*! \brief Get the positions buffer on the GPU.
*
* \returns The number of local atoms.
*/
- int numAtomsLocal();
+ int numAtomsLocal() const;
/*! \brief Getter for the total number of atoms.
*
* \returns The total number of atoms.
*/
- int numAtomsAll();
+ int numAtomsAll() const;
private:
class Impl;
"GPU implementation.");
}
-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
return nullptr;
}
-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
return 0;
}
-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
*
* \returns Tuple containing the index of the first atom in the range and the total number of atoms in the range.
*/
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
/*! \brief Get the positions buffer on the GPU.
*
* \returns The number of local atoms.
*/
- int numAtomsLocal();
+ int numAtomsLocal() const;
/*! \brief Getter for the total number of atoms.
*
* \returns The total number of atoms.
*/
- int numAtomsAll();
+ int numAtomsAll() const;
private:
//! GPU PME stream.
void clearOnDevice(DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- const DeviceStream& deviceStream);
+ const DeviceStream& deviceStream) const;
};
} // namespace gmx
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
{
int atomsStartAt = 0;
int numAtomsToCopy = 0;
void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- const DeviceStream& deviceStream)
+ const DeviceStream& deviceStream) const
{
GMX_UNUSED_VALUE(dataSize);
return updateStream_;
}
-int StatePropagatorDataGpu::Impl::numAtomsLocal()
+int StatePropagatorDataGpu::Impl::numAtomsLocal() const
{
return numAtomsLocal_;
}
-int StatePropagatorDataGpu::Impl::numAtomsAll()
+int StatePropagatorDataGpu::Impl::numAtomsAll() const
{
return numAtomsAll_;
}
return impl_->reinit(numAtomsLocal, numAtomsAll);
}
-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
{
return impl_->getAtomRangesFromAtomLocality(atomLocality);
}
return impl_->getUpdateStream();
}
-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
{
return impl_->numAtomsLocal();
}
-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
{
return impl_->numAtomsAll();
}
--- /dev/null
+# List of rationales for check suppressions (where known).
+# These have to precede the list because inline comments are not
+# supported by clang-tidy.
+#
+# -readability-implicit-bool-conversion
+# This happens often in the GPU code and should generally be harmless
+Checks: -readability-implicit-bool-conversion
+InheritParentConfig: true
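For reference, a hypothetical illustration (not taken from the GROMACS sources) of the pattern this suppressed check flags:

float* d_buffer = nullptr;
int    count    = 3;

// Implicit conversions that readability-implicit-bool-conversion reports:
bool haveBuffer = d_buffer; // pointer -> bool
bool haveWork   = count;    // int -> bool

// The spellings the check would suggest instead:
bool haveBuffer2 = (d_buffer != nullptr);
bool haveWork2   = (count != 0);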
 * there is a bit of fluctuation in the generated block counts, we use
* a target of 44 instead of the ideal value of 48.
*/
-static unsigned int gpu_min_ci_balanced_factor = 44;
+static const unsigned int gpu_min_ci_balanced_factor = 44;
void gpu_init_platform_specific(NbnxmGpu* /* nb */)
{
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
# endif /* CALC_ENERGIES */
#endif /* PRUNE_NBL */
- (const NBAtomDataGpu atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+ (NBAtomDataGpu atdat, NBParamGpu nbparam, Nbnxm::gpu_plist plist, bool bCalcFshift)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
#else
* sm_nextSlotPtr should always be updated to point to the "next slot",
* that is past the last point where data has been stored.
*/
+ // NOLINTNEXTLINE(readability-redundant-declaration)
extern __shared__ char sm_dynamicShmem[];
char* sm_nextSlotPtr = sm_dynamicShmem;
static_assert(sizeof(char) == 1,
"The shared memory offset calculation assumes that char is 1 byte");
/* shmem buffer for i x+q pre-loading */
- float4* xqib = (float4*)sm_nextSlotPtr;
+ float4* xqib = reinterpret_cast<float4*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));
/* shmem buffer for cj, for each warp separately */
- int* cjs = (int*)(sm_nextSlotPtr);
+ int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
/* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
# ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
- int* atib = (int*)sm_nextSlotPtr;
+ int* atib = reinterpret_cast<int*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
# else
/* shmem buffer for i-atom LJ combination rule parameters */
- float2* ljcpib = (float2*)sm_nextSlotPtr;
+ float2* ljcpib = reinterpret_cast<float2*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
# endif
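The code above is a bump allocator over the dynamically sized shared-memory block: each buffer is carved off at sm_nextSlotPtr, which then advances past it. A condensed, self-contained sketch of the same pattern (buffer counts here are invented for illustration):

extern __shared__ char sm_dynamicShmem[];

__global__ void bumpAllocExample()
{
    char* sm_nextSlotPtr = sm_dynamicShmem;

    // First slot: 64 float4 elements, starting at the (16-byte aligned) base.
    float4* xqBuf = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += 64 * sizeof(*xqBuf);

    // The second slot begins exactly where the first one ended.
    int* cjBuf = reinterpret_cast<int*>(sm_nextSlotPtr);
    sm_nextSlotPtr += 32 * sizeof(*cjBuf);

    // The launch must request the sum as dynamic shared memory:
    //   bumpAllocExample<<<grid, block, 64 * sizeof(float4) + 32 * sizeof(int)>>>();
    (void)xqBuf;
    (void)cjBuf;
}

Placing the buffer with the strictest alignment requirement (float4 here) first keeps every subsequent slot adequately aligned without explicit padding.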
/*********************************************************************/
ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
ai = ci * c_clSize + tidxi;
- float* shiftptr = (float*)&shift_vec[nb_sci.shift];
- xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
+ const float* shiftptr = reinterpret_cast<const float*>(&shift_vec[nb_sci.shift]);
+ xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0F);
xqbuf.w *= nbparam.epsfac;
xqib[tidxj * c_clSize + tidxi] = xqbuf;
for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- fci_buf[i] = make_float3(0.0f);
+ fci_buf[i] = make_float3(0.0F);
}
# ifdef LJ_EWALD
# ifdef CALC_ENERGIES
- E_lj = 0.0f;
- E_el = 0.0f;
+ E_lj = 0.0F;
+ E_el = 0.0F;
# ifdef EXCLUSION_FORCES /* Ewald or RF */
if (nb_sci.shift == gmx::c_centralShiftIndex
# ifdef LJ_EWALD
// load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
- E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (ntypes + 1)]);
+ E_lj += LDG(reinterpret_cast<float*>(
+ &nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1)]));
# endif
}
/* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
# ifdef LJ_EWALD
E_lj /= c_clSize * NTHREAD_Z;
- E_lj *= 0.5f * c_oneSixth * lje_coeff6_6;
+ E_lj *= 0.5F * c_oneSixth * lje_coeff6_6;
# endif
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
/* Correct for epsfac^2 due to adding qi^2 */
E_el /= nbparam.epsfac * c_clSize * NTHREAD_Z;
# if defined EL_RF || defined EL_CUTOFF
- E_el *= -0.5f * reactionFieldShift;
+ E_el *= -0.5F * reactionFieldShift;
# else
E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
# endif
ljcp_j = lj_comb[aj];
# endif
- fcj_buf = make_float3(0.0f);
+ fcj_buf = make_float3(0.0F);
# if !defined PRUNE_NBL
# pragma unroll 8
}
# endif
- int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+ int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;
/* cutoff & exclusion check */
# ifdef EXCLUSION_FORCES
sig_r6 *= int_bit;
# endif /* EXCLUSION_FORCES */
- F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+ F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
# endif /* !LJ_COMB_LB || CALC_ENERGIES */
# ifdef LJ_FORCE_SWITCH
/* Separate VDW cut-off check to enable twin-range cut-offs
* (rvdw < rcoulomb <= rlist)
*/
- vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
F_invr *= vdw_in_range;
# ifdef CALC_ENERGIES
E_lj_p *= vdw_in_range;
# endif
# ifdef EL_RF
E_el += qi * qj_f
- * (int_bit * inv_r + 0.5f * two_k_rf * r2 - reactionFieldShift);
+ * (int_bit * inv_r + 0.5F * two_k_rf * r2 - reactionFieldShift);
# endif
# ifdef EL_EWALD_ANY
- /* 1.0f - erff is faster than erfcf */
+ /* 1.0F - erff is faster than erfcf */
E_el += qi * qj_f
* (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
# endif /* EL_EWALD_ANY */
bCalcFshift = false;
}
- float fshift_buf = 0.0f;
+ float fshift_buf = 0.0F;
/* reduce i forces */
for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
*/
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
- void nbnxn_kernel_prune_cuda(const NBAtomDataGpu atdat,
- const NBParamGpu nbparam,
- const Nbnxm::gpu_plist plist,
- int numParts,
- int part)
+ void nbnxn_kernel_prune_cuda(NBAtomDataGpu atdat,
+ NBParamGpu nbparam,
+ Nbnxm::gpu_plist plist,
+ int numParts,
+ int part)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
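Dropping the top-level const from these by-value parameters is what readability-avoid-const-params-in-decls asks for: with FUNCTION_DECLARATION_ONLY defined, the same text is compiled as a declaration, where the const has no effect on callers. A minimal illustration:

// In a declaration, top-level const on a by-value parameter is meaningless
// to the caller; these two lines declare the same function.
void f(const int n);
void f(int n);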
"The shared memory offset calculation assumes that char is 1 byte");
/* shmem buffer for i x+q pre-loading */
- float4* xib = (float4*)sm_nextSlotPtr;
+ float4* xib = reinterpret_cast<float4*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));
/* shmem buffer for cj, for each warp separately */
- int* cjs = (int*)(sm_nextSlotPtr);
+ int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
/* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
static const unsigned __device__ superClInteractionMask =
((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-static const float __device__ c_oneSixth = 0.16666667f;
-static const float __device__ c_oneTwelveth = 0.08333333f;
+static const float __device__ c_oneSixth = 0.16666667F;
+static const float __device__ c_oneTwelveth = 0.08333333F;
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}
/*! Apply potential switch, force-only version. */
-static __forceinline__ __device__ void
-calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
+static __forceinline__ __device__ void calculate_potential_switch_F(const NBParamGpu& nbparam,
+ float inv_r,
+ float r2,
+ float* F_invr,
+ const float* E_lj)
{
float r, r_switch;
float sw, dsw;
r_switch = r - nbparam.rvdw_switch;
/* Unlike in the F+E kernel, conditional is faster here */
- if (r_switch > 0.0f)
+ if (r_switch > 0.0F)
{
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
/* Unlike in the F-only kernel, masking is faster here */
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam.sh_lj_ewald * int_bit;
- *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
/*! Fetch per-type LJ parameters.
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam.sh_lj_ewald * int_bit;
- *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBParamGpu nbparam, float r)
{
float normalized = nbparam.coulomb_tab_scale * r;
- int index = (int)normalized;
+ int index = static_cast<int>(normalized);
float fraction = normalized - index;
float2 d01 = fetch_coulomb_force_r(nbparam, index);
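For context, the remainder of this function lies outside the hunk; d01 holds the tabulated force at index and index + 1, and the final step is presumably a linear interpolation between the two, along these lines:

    // Assumed completion, for illustration: blend the two fetched table
    // entries with the fractional part of the table coordinate.
    return d01.x + (d01.y - d01.x) * fraction;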
* Depending on what is supported, it fetches parameters either
* using direct load, texture objects, or texrefs.
*/
+// NOLINTNEXTLINE(google-runtime-references)
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
{
float2 c6c12;
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
- const float FN6 = -1.7357322914161492954e-8f;
- const float FN5 = 1.4703624142580877519e-6f;
- const float FN4 = -0.000053401640219807709149f;
- const float FN3 = 0.0010054721316683106153f;
- const float FN2 = -0.019278317264888380590f;
- const float FN1 = 0.069670166153766424023f;
- const float FN0 = -0.75225204789749321333f;
-
- const float FD4 = 0.0011193462567257629232f;
- const float FD3 = 0.014866955030185295499f;
- const float FD2 = 0.11583842382862377919f;
- const float FD1 = 0.50736591960530292870f;
- const float FD0 = 1.0f;
+ const float FN6 = -1.7357322914161492954e-8F;
+ const float FN5 = 1.4703624142580877519e-6F;
+ const float FN4 = -0.000053401640219807709149F;
+ const float FN3 = 0.0010054721316683106153F;
+ const float FN2 = -0.019278317264888380590F;
+ const float FN1 = 0.069670166153766424023F;
+ const float FN0 = -0.75225204789749321333F;
+
+ const float FD4 = 0.0011193462567257629232F;
+ const float FD3 = 0.014866955030185295499F;
+ const float FD2 = 0.11583842382862377919F;
+ const float FD1 = 0.50736591960530292870F;
+ const float FD0 = 1.0F;
float z4;
float polyFN0, polyFN1, polyFD0, polyFD1;
polyFD0 = polyFD0 * z4 + FD0;
polyFD0 = polyFD1 * z2 + polyFD0;
- polyFD0 = 1.0f / polyFD0;
+ polyFD0 = 1.0F / polyFD0;
polyFN0 = FN6 * z4 + FN4;
polyFN1 = FN5 * z4 + FN3;
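The constants above define a rational approximation FN(z2)/FD(z2), and the interleaved updates in z4 and z2 are a two-way split of Horner's rule that shortens the dependency chain. A self-contained sketch of that evaluation strategy on a generic degree-4 polynomial (the coefficients are placeholders, not the PME ones):

// Evaluate p(x) = c[4]*x^4 + c[3]*x^3 + c[2]*x^2 + c[1]*x + c[0]
// as (even part in x^2) + x * (odd part in x^2), so the two Horner
// chains carry no dependency on each other until the final combine.
static float hornerTwoWay(float x, const float c[5])
{
    const float x2   = x * x;
    float       even = c[4] * x2 + c[2];
    even             = even * x2 + c[0];  // c4*x^4 + c2*x^2 + c0
    const float odd  = c[3] * x2 + c[1];  // (c3*x^2 + c1), times x below
    return even + odd * x;
}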
* arbitrary array sizes.
*/
static __forceinline__ __device__ void
-reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
+reduce_force_j_generic(const float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
if (tidxi < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
{
f += f_buf[c_fbufStride * tidxi + j];
* arbitrary array sizes.
* TODO: add the tidxi < 3 trick
*/
-static __forceinline__ __device__ void reduce_force_i_generic(float* f_buf,
- float3* fout,
- float* fshift_buf,
- bool bCalcFshift,
- int tidxi,
- int tidxj,
- int aidx)
+static __forceinline__ __device__ void reduce_force_i_generic(const float* f_buf,
+ float3* fout,
+ float* fshift_buf,
+ bool bCalcFshift,
+ int tidxi,
+ int tidxj,
+ int aidx)
{
if (tidxj < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
{
f += f_buf[tidxj * c_fbufStride + j];
int i, j;
float f;
- assert(c_clSize == 1 << c_clSizeLog2);
+ static_assert(c_clSize == 1 << c_clSizeLog2);
/* Reduce the initial c_clSize values for each i atom to half
* every step by using c_clSize * i threads.
// Destination address where x should be stored in nbnxm layout. We use this cast here to
// save only x, y and z components, not touching the w (q) component, which is pre-defined.
- float3* gm_xqDest = (float3*)&gm_xq[threadIndex + offset];
+ float3* gm_xqDest = reinterpret_cast<float3*>(&gm_xq[threadIndex + offset]);
// Perform layout conversion of each element.
if (threadIndex < numAtoms)
#include "gromacs/gpu_utils/gputraits.h"
class DeviceStream;
-class Grid;
struct NbnxmGpu;
namespace Nbnxm
{
+class Grid;
+
/*! \brief Launch coordinate layout conversion kernel
*
* \param[in] grid Pair-search grid.
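Moving the forward declaration of Grid is more than cosmetic: a forward declaration introduces the name into the enclosing namespace, so declaring it at global scope would have named a different type than Nbnxm::Grid. A minimal illustration:

class Grid; // declares ::Grid

namespace Nbnxm
{
class Grid; // declares Nbnxm::Grid, a distinct type from ::Grid
} // namespace Nbnxm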
{
if (nbp->coulomb_tab)
{
- destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+ destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
}
nbp->coulomb_tab_scale = tables.scale;
/* Free nbparam */
if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
{
- destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+ destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
}
if (!useLjCombRule(nb->nbparam->vdwType))
{
- destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+ destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
}
if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
{
- destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+ destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
}
/* Free plist */
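These call sites imply that destroyParamLookupTable now takes the texture object by pointer rather than by value, presumably so the callee can reset it and to avoid a non-const reference parameter. A hypothetical shape of the signature (an assumption for illustration; the actual GROMACS declaration may differ in names and template parameters):

// Assumed signature, not shown in this patch:
template<typename ValueType>
void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture* deviceTexture);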
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static inline void setPbcAiuc(int numPbcDim, const matrix box, PbcAiuc* pbcAiuc)
{
- pbcAiuc->invBoxDiagZ = 0.0f;
- pbcAiuc->boxZX = 0.0f;
- pbcAiuc->boxZY = 0.0f;
- pbcAiuc->boxZZ = 0.0f;
- pbcAiuc->invBoxDiagY = 0.0f;
- pbcAiuc->boxYX = 0.0f;
- pbcAiuc->boxYY = 0.0f;
- pbcAiuc->invBoxDiagX = 0.0f;
- pbcAiuc->boxXX = 0.0f;
+ pbcAiuc->invBoxDiagZ = 0.0F;
+ pbcAiuc->boxZX = 0.0F;
+ pbcAiuc->boxZY = 0.0F;
+ pbcAiuc->boxZZ = 0.0F;
+ pbcAiuc->invBoxDiagY = 0.0F;
+ pbcAiuc->boxYX = 0.0F;
+ pbcAiuc->boxYY = 0.0F;
+ pbcAiuc->invBoxDiagX = 0.0F;
+ pbcAiuc->boxXX = 0.0F;
if (numPbcDim > ZZ)
{
- pbcAiuc->invBoxDiagZ = 1.0f / box[ZZ][ZZ];
+ pbcAiuc->invBoxDiagZ = 1.0F / box[ZZ][ZZ];
pbcAiuc->boxZX = box[ZZ][XX];
pbcAiuc->boxZY = box[ZZ][YY];
pbcAiuc->boxZZ = box[ZZ][ZZ];
}
if (numPbcDim > YY)
{
- pbcAiuc->invBoxDiagY = 1.0f / box[YY][YY];
+ pbcAiuc->invBoxDiagY = 1.0F / box[YY][YY];
pbcAiuc->boxYX = box[YY][XX];
pbcAiuc->boxYY = box[YY][YY];
}
if (numPbcDim > XX)
{
- pbcAiuc->invBoxDiagX = 1.0f / box[XX][XX];
+ pbcAiuc->invBoxDiagX = 1.0F / box[XX][XX];
pbcAiuc->boxXX = box[XX][XX];
}
}
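The fields set up here feed the minimum-image computation in pbcDxAiuc below; as a rough sketch of how such a shift is applied along z for a triclinic box (simplified from the general scheme, not a verbatim excerpt):

    // Simplified illustration: remove whole-box multiples along z.
    // shiftZ is the nearest integer number of box images separating the points.
    float shiftZ = rintf(dr.z * pbcAiuc.invBoxDiagZ);
    dr.x -= shiftZ * pbcAiuc.boxZX;
    dr.y -= shiftZ * pbcAiuc.boxZY;
    dr.z -= shiftZ * pbcAiuc.boxZZ;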
*/
template<bool returnShift>
static __forceinline__ __device__ int
+// NOLINTNEXTLINE(google-runtime-references)
pbcDxAiuc(const PbcAiuc& pbcAiuc, const float4 r1, const float4 r2, float3& dr)
{
dr.x = r1.x - r2.x;