#
# -misc-no-recursion
# We have way too many functions and methods relying on recursion
+#
+# -readability-static-accessed-through-instance
+# The GPU code in particular uses this in many places.
+#
+# -performance-type-promotion-in-math-fn
+# Those warnings are in the GPU code and we shouldn't need to worry about them.
Checks: clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureAPI.strcpy,
bugprone-*,misc-*,readability-*,performance-*,mpi-*,
-readability-inconsistent-declaration-parameter-name,
-bugprone-narrowing-conversions,
-google-readability-avoid-underscore-in-googletest-name,
-cppcoreguidelines-init-variables,
- -misc-no-recursion
+ -misc-no-recursion,
+ -readability-static-accessed-through-instance,
+ -performance-type-promotion-in-math-fn
HeaderFilterRegex: .*
CheckOptions:
- key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
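For reference, readability-static-accessed-through-instance (disabled above) flags static members that are called through an object instance rather than through the class name. A minimal sketch of the pattern, with hypothetical names, that the GPU code hits in many places:

    struct KernelConfig
    {
        static int maxThreadsPerBlock() { return 256; }
    };

    int launchSize(const KernelConfig& config)
    {
        // Flagged: static member accessed through an instance; harmless, but
        // clang-tidy would suggest KernelConfig::maxThreadsPerBlock() instead.
        return config.maxThreadsPerBlock();
    }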
*gm_dataDest = *gm_dataSrc;
}
}
-
- return;
}
/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index
*gm_dataDest = *gm_dataSrc;
}
}
-
- return;
}
void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
-
- return;
}
void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
// Similarly send event to task that will push data to this task.
GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
recvRankX_,
0,
&remoteCoordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
sendRankX_,
0,
communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
-
- return;
}
// The following method should be called after non-local buffer operations,
haloDataTransferLaunched_->markEvent(nonLocalStream_);
MPI_Sendrecv(&haloDataTransferLaunched_,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
sendRank,
0,
&haloDataTransferRemote,
- sizeof(GpuEventSynchronizer*),
+ sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
MPI_BYTE,
recvRank,
0,
#if GMX_MPI
// Receive event from PP task
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]);
recvCount_++;
#else
#if GMX_MPI
// TODO Using MPI_Isend would be more efficient, particularly when
// sending to multiple PP ranks
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
#else
GMX_UNUSED_VALUE(ppRank);
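The NOLINT(bugprone-sizeof-expression) annotations above suppress a check that assumes sizeof applied to a pointer type is a typo for sizeof of the pointee. Here the pointer value itself is the message payload (with thread-MPI the ranks share an address space), so the expression is intended. A minimal sketch of the pattern, with hypothetical names:

    #include <mpi.h>

    // Exchange raw pointer values with neighbour ranks; only meaningful when
    // the peer can dereference them, e.g. in a shared address space.
    void exchangePointer(void* localPtr, void** remotePtr, int sendRank, int recvRank, MPI_Comm comm)
    {
        MPI_Sendrecv(&localPtr,
                     sizeof(localPtr), // sizeof(void*), intended here
                     MPI_BYTE,
                     sendRank,
                     0,
                     remotePtr,
                     sizeof(*remotePtr),
                     MPI_BYTE,
                     recvRank,
                     0,
                     comm,
                     MPI_STATUS_IGNORE);
    }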
* \param[in] ppRanks List of PP ranks
*/
Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ // NOLINTNEXTLINE(performance-trivially-destructible)
~Impl();
/*! \brief
case ZZ: return realGridSizeFP[ZZ];
}
assert(false);
- return 0.0f;
+ return 0.0F;
}
/*! \brief Reduce the partial force contributions.
const int splineIndex,
const int lineIndex,
const float* realGridSizeFP,
- float& fx,
- float& fy,
- float& fz)
+ float& fx, // NOLINT(google-runtime-references)
+ float& fy, // NOLINT(google-runtime-references)
+ float& fz) // NOLINT(google-runtime-references)
{
if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
{
if (dimIndex < DIM)
{
const float n = read_grid_size(realGridSizeFP, dimIndex);
- *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
+ float* __restrict__ sm_forcesAtomIndexOffset =
+ reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
+ sm_forcesAtomIndexOffset[dimIndex] = fx * n;
}
}
else
if (sourceIndex == minStride * atomIndex)
{
- *((float*)(&sm_forces[atomIndex]) + dimIndex) =
+ float* __restrict__ sm_forcesAtomIndexOffset =
+ reinterpret_cast<float*>(&sm_forces[atomIndex]);
+ sm_forcesAtomIndexOffset[dimIndex] =
(sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
}
}
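Both hunks replace C-style casts with reinterpret_cast while keeping the same indexing idiom: a float3 is three contiguous floats, so one thread per dimension (dimIndex in {XX, YY, ZZ}) can write its component without branching. A minimal sketch, assuming CUDA's float3 layout:

    __device__ void writeForceComponent(float3* sm_forces, int atomIndex, int dimIndex, float value)
    {
        // View the float3 as a flat array of its x, y, z components.
        float* components = reinterpret_cast<float*>(&sm_forces[atomIndex]);
        components[dimIndex] = value;
    }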
kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
__syncwarp();
}
- float fx = 0.0f;
- float fy = 0.0f;
- float fz = 0.0f;
+ float fx = 0.0F;
+ float fy = 0.0F;
+ float fz = 0.0F;
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);
{
int outputIndexLocal = i * iterThreads + threadLocalId;
int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
{
/* We must sync here since the same shared memory is used as above. */
__syncthreads();
- fx = 0.0f;
- fy = 0.0f;
- fz = 0.0f;
+ fx = 0.0F;
+ fy = 0.0F;
+ fz = 0.0F;
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
if (chargeCheck)
{
{
int outputIndexLocal = i * iterThreads + threadLocalId;
int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
gm_forces[outputIndexGlobal] += outputForceComponent;
}
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const int realGridSizePaddedTotal =
realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
- realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex];
+ realGrid_ = reinterpret_cast<cufftReal*>(kernelParamsPtr->grid.d_realGrid[gridIndex]);
GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
- complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex];
+ complexGrid_ = reinterpret_cast<cufftComplex*>(kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
cufftResult_t result;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* This is called from the spline_and_spread and gather PME kernels.
*/
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+bool __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
{
assert(isfinite(coefficient));
- return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+ return c_skipNeutralAtoms ? (coefficient != 0.0F) : true;
}
//! Controls if the atom and charge data is prefetched into shared memory or loaded per thread from global
template<>
__device__ inline void assertIsFinite(float3 gmx_unused arg)
{
- assert(isfinite(float(arg.x)));
- assert(isfinite(float(arg.y)));
- assert(isfinite(float(arg.z)));
+ assert(isfinite(static_cast<float>(arg.x)));
+ assert(isfinite(static_cast<float>(arg.y)));
+ assert(isfinite(static_cast<float>(arg.z)));
}
template<typename T>
__device__ inline void assertIsFinite(T gmx_unused arg)
{
- assert(isfinite(float(arg)));
+ assert(isfinite(static_cast<float>(arg)));
}
/*! \brief
const float shift = c_pmeMaxUnitcellShift;
/* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
t = (t + shift) * n;
- tInt = (int)t;
+ tInt = static_cast<int>(t);
assert(sharedMemoryIndex < atomsPerBlock * DIM);
sm_fractCoords[sharedMemoryIndex] = t - tInt;
tableIndex += tInt;
assert(isfinite(dr));
/* dr is relative offset from lower cell limit */
- splineData[order - 1] = 0.0f;
+ splineData[order - 1] = 0.0F;
splineData[1] = dr;
- splineData[0] = 1.0f - dr;
+ splineData[0] = 1.0F - dr;
#pragma unroll
for (int k = 3; k < order; k++)
{
- div = 1.0f / (k - 1.0f);
+ div = 1.0F / (k - 1.0F);
splineData[k - 1] = div * dr * splineData[k - 2];
#pragma unroll
for (int l = 1; l < (k - 1); l++)
splineData[k - l - 1] =
div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]);
}
- splineData[0] = div * (1.0f - dr) * splineData[0];
+ splineData[0] = div * (1.0F - dr) * splineData[0];
}
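For reference, the loop above implements the standard cardinal B-spline recurrence used in smooth PME (Essmann et al., 1995), with dr the fractional coordinate offset and div = 1/(k - 1):

    M_k(u) = \frac{u\,M_{k-1}(u) + (k - u)\,M_{k-1}(u - 1)}{k - 1}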
const int thetaIndexBase =
const int thetaIndex =
getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
- const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
+ const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o];
assert(isfinite(dtheta));
assert(thetaIndex < order * DIM * atomsPerBlock);
if (writeSmDtheta)
}
}
- div = 1.0f / (order - 1.0f);
+ div = 1.0F / (order - 1.0F);
splineData[order - 1] = div * dr * splineData[order - 2];
#pragma unroll
for (int k = 1; k < (order - 1); k++)
* ((dr + k) * splineData[order - k - 2]
+ (order - k - dr) * splineData[order - k - 1]);
}
- splineData[0] = div * (1.0f - dr) * splineData[0];
+ splineData[0] = div * (1.0F - dr) * splineData[0];
/* Storing the spline values (theta) */
#pragma unroll
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU_CUDA
destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
- kernelParamsPtr->fractShiftsTableTexture);
+ &kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
- kernelParamsPtr->gridlineIndicesTableTexture);
+ &kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int mode, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
-__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_spline_and_spread_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template<GridOrdering gridOrdering, bool computeEnergyAndVirial, const int gridIndex> /* It is significantly slower to pass gridIndex as a kernel parameter */
-__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_solve_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format on
template<int order, bool wrapX, bool wrapY, int nGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
-__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_gather_kernel(PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// Reallocate buffer used for staging PME force on GPU
reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
- return;
}
void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu)
// Receive event from PME task and add to stream, to ensure pull of data doesn't
// occur before PME force calc is completed
GpuEventSynchronizer* pmeSync;
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
pmeSync->enqueueWaitEvent(pmePpCommStream_);
#endif
// Record and send event to allow PME task to sync to above transfer before commencing force calculations
pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
#endif
}
int sendSize,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
-private:
//! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
//! Handle for CUDA stream used for the communication operations in this class
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex]
+ kernelParams.grid.splineValuesOffset[minorDim];
float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex];
- float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex];
+ float2* __restrict__ gm_grid = reinterpret_cast<float2*>(kernelParams.grid.d_fourierGrid[gridIndex]);
/* Various grid sizes and indices */
const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
const int indexMajor = blockIdx.z;
/* Optional outputs */
- float energy = 0.0f;
- float virxx = 0.0f;
- float virxy = 0.0f;
- float virxz = 0.0f;
- float viryy = 0.0f;
- float viryz = 0.0f;
- float virzz = 0.0f;
+ float energy = 0.0F;
+ float virxx = 0.0F;
+ float virxy = 0.0F;
+ float virxz = 0.0F;
+ float viryy = 0.0F;
+ float viryz = 0.0F;
+ float virzz = 0.0F;
assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
}
/* 0.5 correction factor for the first and last components of a Z dimension */
- float corner_fac = 1.0f;
+ float corner_fac = 1.0F;
switch (gridOrdering)
{
case GridOrdering::YZX:
if ((kMiddle == 0) | (kMiddle == maxkMiddle))
{
- corner_fac = 0.5f;
+ corner_fac = 0.5F;
}
break;
case GridOrdering::XYZ:
if ((kMinor == 0) | (kMinor == maxkMinor))
{
- corner_fac = 0.5f;
+ corner_fac = 0.5F;
}
break;
+ mZ * kernelParams.current.recipBox[ZZ][ZZ];
const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
- assert(m2k != 0.0f);
+ assert(m2k != 0.0F);
// TODO: use LDG/textures for gm_splineValue
float denom = m2k * float(CUDART_PI_F) * kernelParams.current.boxVolume
* gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle]
* gm_splineValueMinor[kMinor];
assert(isfinite(denom));
- assert(denom != 0.0f);
+ assert(denom != 0.0F);
const float tmp1 = expf(-kernelParams.grid.ewaldFactor * m2k);
const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
if (computeEnergyAndVirial)
{
const float tmp1k =
- 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+ 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
- float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
+ float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
float ets2 = corner_fac * tmp1k;
energy = ets2;
/* Reduce 7 outputs per warp in the shared memory */
const int stride =
8; // this is c_virialAndEnergyCount==7 rounded up to power of 2 for convenience, hence the assert
- assert(c_virialAndEnergyCount == 7);
+ static_assert(c_virialAndEnergyCount == 7);
const int reductionBufferSize = (c_solveMaxThreadsPerBlock / warp_size) * stride;
__shared__ float sm_virialAndEnergy[reductionBufferSize];
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static __forceinline__ __device__ T fetchFromTexture(const cudaTextureObject_t texObj, int index)
{
assert(index >= 0);
+ // NOLINTNEXTLINE(misc-static-assert)
assert(!c_disableCudaTextures);
return tex1Dfetch<T>(texObj, index);
}
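The NOLINT suppresses misc-static-assert, which suggests static_assert because the condition is a compile-time constant. Converting it would fail to compile configurations that disable textures, even though this fetch path is never called there; a runtime assert only fires if the code is actually reached. A minimal sketch with a hypothetical flag:

    #include <cassert>

    constexpr bool c_featureDisabled = false; // hypothetical build-time flag

    int fetchValue(int index)
    {
        // static_assert(!c_featureDisabled) would break builds that set the
        // flag but never call this function; assert only fires when reached.
        assert(!c_featureDisabled);
        return index;
    }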
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess.
*/
-static inline std::string getDeviceErrorString(const cudaError_t deviceError)
+inline std::string getDeviceErrorString(const cudaError_t deviceError)
{
return formatString("CUDA error #%d (%s): %s.",
deviceError,
*
* \throws InternalError if deviceError is not a success.
*/
-static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
+inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
{
if (deviceError != cudaSuccess)
{
*
* \param[in] errorMessage Undecorated error message.
*/
-static inline void ensureNoPendingDeviceError(const std::string& errorMessage)
+inline void ensureNoPendingDeviceError(const std::string& errorMessage)
{
// Ensure there is no pending error that would otherwise affect
// the behaviour of future error handling.
#ifdef CHECK_CUDA_ERRORS
/*! Check for CUDA error on the return status of a CUDA RT API call. */
-# define CU_RET_ERR(deviceError, msg) \
- do \
- { \
- if (deviceError != cudaSuccess) \
- { \
- gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \
- } \
+# define CU_RET_ERR(deviceError, msg) \
+ do \
+ { \
+ if ((deviceError) != cudaSuccess) \
+ { \
+ gmx_fatal(FARGS, "%s\n", ((msg) + gmx::getDeviceErrorString(deviceError)).c_str()); \
+ } \
} while (0)
#else /* CHECK_CUDA_ERRORS */
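Parenthesizing deviceError and msg in the expansion guards against argument expressions whose low-precedence operators would otherwise parse unexpectedly. A minimal sketch of the hazard, with hypothetical macros:

    #define SQUARE_BAD(x) x * x
    #define SQUARE_GOOD(x) ((x) * (x))

    int demo()
    {
        int a = SQUARE_BAD(1 + 2);  // expands to 1 + 2 * 1 + 2 == 5
        int b = SQUARE_GOOD(1 + 2); // expands to ((1 + 2) * (1 + 2)) == 9
        return a + b;
    }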
const CurrentArg* argPtr,
const RemainingArgs*... otherArgsPtrs)
{
+ // NOLINTNEXTLINE(google-readability-casting)
(*kernelArgsPtr)[argIndex] = (void*)argPtr;
prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs...);
}
{
dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
+ // NOLINTNEXTLINE(google-readability-casting)
cudaLaunchKernel((void*)kernel,
gridSize,
blockSize,
void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& /* deviceContext */)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
- cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
+ // NOLINTNEXTLINE(google-readability-casting)
+ cudaError_t stat = cudaMalloc((void**)(buffer), numValues * sizeof(ValueType));
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
{
case GpuApiCallBehavior::Async:
GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
+ // NOLINTNEXTLINE(google-readability-casting)
stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset,
hostBuffer,
bytes,
case GpuApiCallBehavior::Sync:
stat = cudaMemcpy(
- *((ValueType**)buffer) + startingOffset, hostBuffer, bytes, cudaMemcpyHostToDevice);
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ hostBuffer,
+ bytes,
+ cudaMemcpyHostToDevice);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
GMX_ASSERT(isHostMemoryPinned(hostBuffer),
"Destination host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(hostBuffer,
+ // NOLINTNEXTLINE(google-readability-casting)
*((ValueType**)buffer) + startingOffset,
bytes,
cudaMemcpyDeviceToHost,
break;
case GpuApiCallBehavior::Sync:
- stat = cudaMemcpy(
- hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost);
+ stat = cudaMemcpy(hostBuffer,
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ bytes,
+ cudaMemcpyDeviceToHost);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Synchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
const char pattern = 0;
cudaError_t stat = cudaMemsetAsync(
- *((ValueType**)buffer) + startingOffset, pattern, bytes, deviceStream.stream());
+ // NOLINTNEXTLINE(google-readability-casting)
+ *((ValueType**)buffer) + startingOffset,
+ pattern,
+ bytes,
+ deviceStream.stream());
GMX_RELEASE_ASSERT(stat == cudaSuccess,
("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str());
}
const size_t sizeInBytes = numValues * sizeof(ValueType);
cudaError_t stat =
+ // NOLINTNEXTLINE(google-readability-casting)
cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
GMX_RELEASE_ASSERT(stat == cudaSuccess,
* \param[in,out] deviceTexture Device texture object to unbind.
*/
template<typename ValueType>
-void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& deviceTexture)
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, const DeviceTexture* deviceTexture)
{
if (!c_disableCudaTextures && deviceTexture && deviceBuffer)
{
- cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
+ cudaError_t stat = cudaDestroyTextureObject(*deviceTexture);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
* \param[in,out] deviceBuffer Device buffer to store data in.
*/
template<typename ValueType>
-void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& /* deviceTexture */)
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture* /* deviceTexture */)
{
deviceBuffer->buffer_.reset(nullptr);
}
#include "gromacs/utility/snprintf.h"
#include "gromacs/utility/stringutil.h"
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
bool isHostMemoryPinned(const void* h_ptr)
return isPinned;
}
-void startGpuProfiler(void)
+void startGpuProfiler()
{
/* The NVPROF_ID environment variable is set by nvprof and indicates that
mdrun is executed in the CUDA profiler.
}
}
-void stopGpuProfiler(void)
+void stopGpuProfiler()
{
/* Stopping the nvidia profiler here allows us to eliminate the subsequent
API calls from the trace, e.g. uninitialization and cleanup. */
}
}
-void resetGpuProfiler(void)
+void resetGpuProfiler()
{
/* With CUDA <=7.5 the profiler can't be properly reset; we can only start
* the profiling here (can't stop it) which will achieve the desired effect if
{
std::string errorString =
gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
- CU_RET_ERR(stat, errorString.c_str());
+ CU_RET_ERR(stat, errorString);
}
if (stat != cudaSuccess)
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* for passing into individual GPU API calls.
* This is just a dummy in CUDA.
*/
- inline CommandEvent* fetchNextEvent() { return nullptr; }
+ static inline CommandEvent* fetchNextEvent() { return nullptr; }
};
//! Short-hand for external use
gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory.");
stat = cudaMallocHost(h_ptr, nbytes, flag);
- sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
+ sprintf(strbuf, "cudaMallocHost of size %d bytes failed", static_cast<int>(nbytes));
CU_RET_ERR(stat, strbuf);
}
EXPECT_TRUE(input.empty());
resizeAndFillInput(&input, 3, 1);
// realloc and copy).
- auto oldInputData = input.data();
+ auto* oldInputData = input.data();
changePinningPolicy(&input, PinningPolicy::CannotBePinned);
EXPECT_FALSE(isPinned(input));
// These cannot be equal as both had to be allocated at the same
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <vector>
+#include "gromacs/utility/arrayref.h"
+
#include "testutils/testasserts.h"
#if !GMX_GPU_CUDA
namespace test
{
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& /* rVecOutput */,
- const std::vector<gmx::RVec>& /* rVecInput */)
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> /* rVecOutput */,
+ ArrayRef<const gmx::RVec> /* rVecInput */)
{
FAIL() << "Can't test float3 and RVec compatibility without CUDA.";
}
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& /* rVecOutput */,
- const std::vector<gmx::RVec>& /* rVecInput */,
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> /* rVecOutput */,
+ ArrayRef<const gmx::RVec> /* rVecInput */,
const TestDevice* /* testDevice */)
{
FAIL() << "Can't test float3 and RVec compatibility without CUDA.";
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/hardware/device_information.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/stringutil.h"
* \param[in] float3Output Output data in float3 format.
* \param[in] numElements Size of the data buffers.
*/
-void inline saveFloat3InRVecFormat(std::vector<gmx::RVec>& rVecOutput, const float3* float3Output, int numElements)
+void inline saveFloat3InRVecFormat(ArrayRef<gmx::RVec> rVecOutput, const float3* float3Output, int numElements)
{
for (int i = 0; i < numElements; i++)
{
}
}
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput)
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput)
{
const int numElements = rVecInput.size();
}
}
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput,
- const std::vector<gmx::RVec>& h_rVecInput,
- const TestDevice* testDevice)
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> h_rVecOutput,
+ ArrayRef<const gmx::RVec> h_rVecInput,
+ const TestDevice* testDevice)
{
const DeviceContext& deviceContext = testDevice->deviceContext();
const DeviceStream& deviceStream = testDevice->deviceStream();
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
namespace gmx
{
+template<typename>
+class ArrayRef;
+
namespace test
{
* \param[out] rVecOutput Data in RVec format for the output.
* \param[in] rVecInput Data in RVec format with the input.
*/
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput);
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput);
/*! \brief Tests the compatibility of RVec and float3 using the conversion on device.
*
* \param[in] rVecInput Data in RVec format with the input.
* \param[in]  testDevice   Test hardware environment to get DeviceContext and DeviceStream from.
*/
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& rVecOutput,
- const std::vector<gmx::RVec>& rVecInput,
- const TestDevice* testDevice);
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> rVecOutput,
+ ArrayRef<const gmx::RVec> rVecInput,
+ const TestDevice* testDevice);
} // namespace test
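Passing ArrayRef by value instead of std::vector by reference lets these helpers accept any contiguous range without committing to a container. A minimal usage sketch, assuming gmx::ArrayRef's implicit conversion from std::vector:

    void roundTripExample()
    {
        std::vector<gmx::RVec> input(3, { 1.0F, 2.0F, 3.0F });
        std::vector<gmx::RVec> output(input.size());
        // Vectors convert implicitly: non-const to ArrayRef<gmx::RVec>,
        // const to ArrayRef<const gmx::RVec>.
        convertRVecToFloat3OnHost(output, input);
    }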
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2015,2016,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
return make_float3(a.x, a.y, a.z);
}
-__forceinline__ __host__ __device__ float3 operator-(float3& a)
+__forceinline__ __host__ __device__ float3 operator-(const float3& a)
{
return make_float3(-a.x, -a.y, -a.z);
}
{
return make_float3(k * a.x, k * a.y, k * a.z);
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float3& a, float3 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float3& a, float4 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator-=(float3& a, float3 b)
{
a.x -= b.x;
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator*=(float3& a, float3 b)
{
a.x *= b.x;
a.y *= b.y;
a.z *= b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator*=(float3& a, float b)
{
a.x *= b;
}
__forceinline__ __host__ __device__ float4 make_float4(float3 a)
{
- return make_float4(a.x, a.y, a.z, 0.0f);
+ return make_float4(a.x, a.y, a.z, 0.0F);
}
__forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
{
a.z += b.z;
a.w += b.w;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator+=(float4& a, float3 b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
}
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __host__ __device__ void operator-=(float4& a, float3 b)
{
a.x -= b.x;
float ipb = norm2(b);
float ip = iprod(a, b);
float ipab = ipa * ipb;
- if (ipab > 0.0f)
+ if (ipab > 0.0F)
{
cosval = ip * rsqrt(ipab);
}
else
{
- cosval = 1.0f;
+ cosval = 1.0F;
}
- if (cosval > 1.0f)
+ if (cosval > 1.0F)
{
- return 1.0f;
+ return 1.0F;
}
- if (cosval < -1.0f)
+ if (cosval < -1.0F)
{
- return -1.0f;
+ return -1.0F;
}
return cosval;
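For reference, the function computes the cosine of the angle between a and b as

    \cos\theta = \frac{\mathbf{a}\cdot\mathbf{b}}{\sqrt{(\mathbf{a}\cdot\mathbf{a})\,(\mathbf{b}\cdot\mathbf{b})}}

clamped to [-1, 1] so that floating-point rounding cannot push the result outside the domain of acos.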
*
* \param[in] a First vector.
* \param[in] b Second vector.
- * \returns Angle between vectors.
*/
+// NOLINTNEXTLINE(google-runtime-references)
__forceinline__ __device__ void atomicAdd(float3& a, const float3 b)
{
atomicAdd(&a.x, b.x);
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
*/
-static int c_cudaMaxDeviceCount = 32;
+static const int c_cudaMaxDeviceCount = 32;
/** Dummy kernel used for sanity checking. */
-static __global__ void dummy_kernel(void) {}
+static __global__ void dummy_kernel() {}
static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
{
if (stat != cudaSuccess)
{
auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
- CU_RET_ERR(stat, message.c_str());
+ CU_RET_ERR(stat, message);
}
if (debug)
int numAtomsPerInteraction,
ArrayRef<const int> nbnxnAtomOrder)
{
- GMX_ASSERT(src.size() == 0 || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");
+ GMX_ASSERT(src.empty() || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");
dest->iatoms.resize(src.size());
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
public:
//! Constructor
Impl(const gmx_ffparams_t& ffparams,
- const float electrostaticsScaleFactor,
+ float electrostaticsScaleFactor,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
gmx_wallcycle* wcycle);
/*-------------------------------- CUDA kernels-------------------------------- */
/*------------------------------------------------------------------------------*/
-#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
+#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0F)
/*---------------- BONDED CUDA kernels--------------*/
__device__ __forceinline__ static void
harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
{
- constexpr float half = 0.5f;
+ constexpr float half = 0.5F;
float dx, dx2;
dx = x - xA;
{
if (i < numBonds)
{
- int3 bondData = *(int3*)(d_forceatoms + 3 * i);
- int type = bondData.x;
- int ai = bondData.y;
- int aj = bondData.z;
+ const int3 bondData = *(reinterpret_cast<const int3*>(d_forceatoms + 3 * i));
+ int type = bondData.x;
+ int ai = bondData.y;
+ int aj = bondData.z;
/* dx = xi - xj, corrected for periodic boundary conditions. */
float3 dx;
*vtot_loc += vbond;
}
- if (dr2 != 0.0f)
+ if (dr2 != 0.0F)
{
fbond *= rsqrtf(dr2);
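For reference, harmonic_gpu evaluates the harmonic bond potential and the magnitude of its derivative,

    V(r) = \tfrac{1}{2}\,k\,(r - r_0)^2, \qquad F = -k\,(r - r_0),

and the rsqrtf(dr2) factor converts F into F/r so that the force vector can be formed directly as (F/r) * dx.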
{
if (i < numBonds)
{
- int4 angleData = *(int4*)(d_forceatoms + 4 * i);
- int type = angleData.x;
- int ai = angleData.y;
- int aj = angleData.z;
- int ak = angleData.w;
+ const int4 angleData = *(reinterpret_cast<const int4*>(d_forceatoms + 4 * i));
+ int type = angleData.x;
+ int ai = angleData.y;
+ int aj = angleData.z;
+ int ak = angleData.w;
float3 r_ij;
float3 r_kj;
}
float cos_theta2 = cos_theta * cos_theta;
- if (cos_theta2 < 1.0f)
+ if (cos_theta2 < 1.0F)
{
- float st = dVdt * rsqrtf(1.0f - cos_theta2);
+ float st = dVdt * rsqrtf(1.0F - cos_theta2);
float sth = st * cos_theta;
float nrij2 = norm2(r_ij);
float nrkj2 = norm2(r_kj);
{
if (i < numBonds)
{
- int4 ubData = *(int4*)(d_forceatoms + 4 * i);
- int type = ubData.x;
- int ai = ubData.y;
- int aj = ubData.z;
- int ak = ubData.w;
+ const int4 ubData = *(reinterpret_cast<const int4*>(d_forceatoms + 4 * i));
+ int type = ubData.x;
+ int ai = ubData.y;
+ int aj = ubData.z;
+ int ak = ubData.w;
float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
float kthA = d_forceparams[type].u_b.kthetaA;
harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
float cos_theta2 = cos_theta * cos_theta;
- if (cos_theta2 < 1.0f)
+ if (cos_theta2 < 1.0F)
{
- float st = dVdt * rsqrtf(1.0f - cos_theta2);
+ float st = dVdt * rsqrtf(1.0F - cos_theta2);
float sth = st * cos_theta;
float nrkj2 = norm2(r_kj);
}
/* Time for the bond calculations */
- if (dr2 != 0.0f)
+ if (dr2 != 0.0F)
{
if (calcEner)
{
*n = cprod(*r_kj, *r_kl);
float phi = gmx_angle(*m, *n);
float ipr = iprod(*r_ij, *n);
- float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
+ float sign = (ipr < 0.0F) ? -1.0F : 1.0F;
phi = sign * phi;
return phi;
mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
sdphi = sinf(mdphi);
- *v = cpA * (1.0f + cosf(mdphi));
+ *v = cpA * (1.0F + cosf(mdphi));
*f = -cpA * mult * sdphi;
}
float3 sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
+ constexpr float c0 = 0.0F, c1 = 1.0F, c2 = 2.0F, c3 = 3.0F, c4 = 4.0F, c5 = 5.0F;
if (i < numBonds)
{
/* dp cannot be outside (-pi,pi) */
if (*dp >= CUDART_PI_F)
{
- *dp -= 2.0f * CUDART_PI_F;
+ *dp -= 2.0F * CUDART_PI_F;
}
else if (*dp < -CUDART_PI_F)
{
- *dp += 2.0f * CUDART_PI_F;
+ *dp += 2.0F * CUDART_PI_F;
}
}
if (calcEner)
{
- *vtot_loc += -0.5f * ddphi * dp;
+ *vtot_loc += -0.5F * ddphi * dp;
}
}
}
if (i < numBonds)
{
// TODO this should be made into a separate type; the GPU and CPU sizes should be compared
- int3 pairData = *(int3*)(d_forceatoms + 3 * i);
- int type = pairData.x;
- int ai = pairData.y;
- int aj = pairData.z;
+ const int3 pairData = *(reinterpret_cast<const int3*>(d_forceatoms + 3 * i));
+ int type = pairData.x;
+ int ai = pairData.y;
+ int aj = pairData.z;
float qq = gm_xq[ai].w * gm_xq[aj].w;
float c6 = iparams[type].lj14.c6A;
float velec = scale_factor * qq * rinv;
/* Calculate the LJ force * r and add it to the Coulomb part */
- float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
+ float fr = (12.0F * c12 * rinv6 - 6.0F * c6) * rinv6 + velec;
float finvr = fr * rinv2;
float3 f = finvr * dr;
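For reference, with rinv = 1/r, the expression is the 12-6 Lennard-Jones force times r plus the scaled Coulomb term (f denoting the electric conversion factor folded into scale_factor),

    -r\,\frac{dV}{dr} = \frac{12\,c_{12}}{r^{12}} - \frac{6\,c_6}{r^6} + \frac{f\,q_i q_j}{r},

which the subsequent multiplication by rinv2 turns into the force per unit displacement along dr.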
extern __shared__ char sm_dynamicShmem[];
char* sm_nextSlotPtr = sm_dynamicShmem;
- float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr;
+ float3* sm_fShiftLoc = reinterpret_cast<float3*>(sm_nextSlotPtr);
sm_nextSlotPtr += c_numShiftVectors * sizeof(float3);
if (calcVir)
{
if (threadIdx.x < c_numShiftVectors)
{
- sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
+ sm_fShiftLoc[threadIdx.x] = make_float3(0.0F, 0.0F, 0.0F);
}
__syncthreads();
}
int warpId = threadIdx.x / warpSize;
// Shared memory variables to hold block-local partial sum
- float* sm_vTot = (float*)sm_nextSlotPtr;
+ float* sm_vTot = reinterpret_cast<float*>(sm_nextSlotPtr);
sm_nextSlotPtr += numWarps * sizeof(float);
- float* sm_vTotVdw = (float*)sm_nextSlotPtr;
+ float* sm_vTotVdw = reinterpret_cast<float*>(sm_nextSlotPtr);
sm_nextSlotPtr += numWarps * sizeof(float);
- float* sm_vTotElec = (float*)sm_nextSlotPtr;
+ float* sm_vTotElec = reinterpret_cast<float*>(sm_nextSlotPtr);
if (threadIdx.x % warpSize == 0)
{
baseForce_ = baseForcePtr;
numAtoms_ = numAtoms;
atomStart_ = atomStart;
- accumulate_ = static_cast<int>(accumulate);
+ accumulate_ = accumulate;
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
rvecForceToAdd_ = forcePtr;
};
-void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency)
{
dependencyList_.push_back(dependency);
}
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-GpuForceReduction::Impl::~Impl() = default;
-
GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
gmx_wallcycle* wcycle) :
impl_->registerRvecForce(forcePtr);
}
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency)
{
impl_->addDependency(dependency);
}
* \param [in] wcycle The wallclock counter
*/
Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
- ~Impl();
+ ~Impl() = default;
/*! \brief Register a nbnxm-format force to be reduced
*
* \param [in] forcePtr Pointer to force to be reduced
*
* \param [in] dependency Dependency for this reduction
*/
- void addDependency(GpuEventSynchronizer* const dependency);
+ void addDependency(GpuEventSynchronizer* dependency);
/*! \brief Reinitialize the GPU force reduction
*
* \param [in] completionMarker Event to be marked when launch of reduction is complete
*/
void reinit(DeviceBuffer<Float3> baseForcePtr,
- const int numAtoms,
+ int numAtoms,
ArrayRef<const int> cell,
- const int atomStart,
- const bool accumulate,
+ int atomStart,
+ bool accumulate,
GpuEventSynchronizer* completionMarker = nullptr);
/*! \brief Execute the force reduction */
//! number of atoms
int numAtoms_ = 0;
//! whether reduction is accumulated into base force buffer
- int accumulate_ = true;
+ bool accumulate_ = true;
//! cell information for any nbat-format forces
struct cellInfo cellInfo_;
//! GPU context object
*gm_fDest = temp;
}
- return;
}
void launchForceReductionKernel(int numAtoms,
void integrate(DeviceBuffer<Float3> d_x,
DeviceBuffer<Float3> d_xp,
DeviceBuffer<Float3> d_v,
- const DeviceBuffer<Float3> d_f,
- const float dt,
- const bool doTemperatureScaling,
+ DeviceBuffer<Float3> d_f,
+ float dt,
+ bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doParrinelloRahman,
- const float dtPressureCouple,
+ bool doParrinelloRahman,
+ float dtPressureCouple,
const matrix prVelocityScalingMatrix);
/*! \brief Set the integrator
* \param[in] inverseMasses Inverse masses of atoms.
* \param[in] tempScaleGroups Maps the atom index to temperature scale value.
*/
- void set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
+ void set(int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
/*! \brief Class with hardware-specific interfaces and implementations.*/
class Impl;
namespace gmx
{
-void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc)
+void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc)
{
GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
virialScaled[ZZ][YY] += h_virialScaled_[4];
virialScaled[ZZ][ZZ] += h_virialScaled_[5];
}
-
- return;
}
LincsGpu::LincsGpu(int numIterations,
int center = c1a1;
- float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
- float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+ float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+ float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
int center = c1a2;
- float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
- float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+ float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+ float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;
* \param[in,out] virialScaled Scaled virial tensor to be updated.
* \param[in] pbcAiuc PBC data.
*/
- void apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc);
+ void apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc);
/*! \brief
* Update data-structures (e.g. after NB search step).
const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled;
- int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+ const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
// numConstraintsThreads should be an integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
// This is to ensure proper synchronization and reduction. All arrays are padded to the required size.
int j = pair.j;
// Mass-scaled Lagrange multiplier
- float lagrangeScaled = 0.0f;
+ float lagrangeScaled = 0.0F;
float targetLength;
float inverseMassi;
// Everything computed for these dummies will be equal to zero
if (isDummyThread)
{
- targetLength = 0.0f;
- inverseMassi = 0.0f;
- inverseMassj = 0.0f;
- sqrtReducedMass = 0.0f;
-
- xi = make_float3(0.0f, 0.0f, 0.0f);
- xj = make_float3(0.0f, 0.0f, 0.0f);
- rc = make_float3(0.0f, 0.0f, 0.0f);
+ targetLength = 0.0F;
+ inverseMassi = 0.0F;
+ inverseMassj = 0.0F;
+ sqrtReducedMass = 0.0F;
+
+ xi = make_float3(0.0F, 0.0F, 0.0F);
+ xj = make_float3(0.0F, 0.0F, 0.0F);
+ rc = make_float3(0.0F, 0.0F, 0.0F);
}
else
{
{
// Making sure that all sm_rhs are saved before they are accessed in a loop below
__syncthreads();
- float mvb = 0.0f;
+ float mvb = 0.0F;
for (int n = 0; n < coupledConstraintsCount; n++)
{
float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
float len2 = targetLength * targetLength;
- float dlen2 = 2.0f * len2 - norm2(dx);
+ float dlen2 = 2.0F * len2 - norm2(dx);
// TODO A slightly more efficient but less readable version of the below would be:
// float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
float proj;
- if (dlen2 > 0.0f)
+ if (dlen2 > 0.0F)
{
proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
}
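For reference, this is the rotational-lengthening correction of the LINCS algorithm (Hess et al., 1997): after the first projection, a constraint with target length d may have stretched to length l, and the correction is proportional to

    d - p, \qquad p = \sqrt{2\,d^2 - l^2},

which the code evaluates as targetLength - dlen2 * rsqrt(dlen2) with dlen2 = 2*len2 - norm2(dx).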
atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
}
}
-
- return;
}
/*! \brief Select templated kernel.
return kernelPtr;
}
-void launchLincsGpuKernel(LincsGpuKernelParameters& kernelParams,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- const DeviceStream& deviceStream)
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ const DeviceStream& deviceStream)
{
auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
nullptr,
"lincs_kernel<updateVelocities, computeVirial>",
kernelArgs);
-
- return;
}
} // namespace gmx
//! Number of threads in a GPU block
constexpr static int c_threadsPerBlock = 256;
-void launchLincsGpuKernel(LincsGpuKernelParameters& kernelParams,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- const DeviceStream& deviceStream);
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ const DeviceStream& deviceStream);
} // namespace gmx
namespace gmx
{
-void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */,
- const DeviceBuffer<Float3> /* d_x */,
+void launchLincsGpuKernel(const LincsGpuKernelParameters& /* kernelParams */,
+ const DeviceBuffer<Float3>& /* d_x */,
DeviceBuffer<Float3> /* d_xp */,
const bool /* updateVelocities */,
DeviceBuffer<Float3> /* d_v */,
namespace gmx
{
-void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc)
+void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc)
{
// Early exit if no settles
virialScaled[ZZ][YY] += h_virialScaled_[4];
virialScaled[ZZ][ZZ] += h_virialScaled_[5];
}
-
- return;
}
SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
* \param[in,out] virialScaled Scaled virial tensor to be updated.
* \param[in] pbcAiuc PBC data.
*/
- void apply(const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- tensor virialScaled,
- const PbcAiuc pbcAiuc);
+ void apply(const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ tensor virialScaled,
+ const PbcAiuc& pbcAiuc);
/*! \brief
* Update data-structures (e.g. after NB search step).
float sinphi = a1d_z * rsqrt(pars.ra * pars.ra);
- float tmp2 = 1.0f - sinphi * sinphi;
+ float tmp2 = 1.0F - sinphi * sinphi;
if (almost_zero > tmp2)
{
float tmp = rsqrt(tmp2);
float cosphi = tmp2 * tmp;
float sinpsi = (b1d.z - c1d.z) * pars.irc2 * tmp;
- tmp2 = 1.0f - sinpsi * sinpsi;
+ tmp2 = 1.0F - sinpsi * sinpsi;
float cospsi = tmp2 * rsqrt(tmp2);
float sinthe = (alpha * gamma - beta * tmp2 * rsqrt(tmp2)) * rsqrt(al2be2 * al2be2);
/* --- Step4 A3' --- */
- tmp2 = 1.0f - sinthe * sinthe;
+ tmp2 = 1.0F - sinthe * sinthe;
float costhe = tmp2 * rsqrt(tmp2);
float3 a3d, b3d, c3d;
{
for (int d = 0; d < 6; d++)
{
- sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0f;
+ sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0F;
}
}
}
atomicAdd(&(gm_virialScaled[tib]), sm_threadVirial[tib * blockSize]);
}
}
-
- return;
}
/*! \brief Select templated kernel.
return kernelPtr;
}
-void launchSettleGpuKernel(const int numSettles,
- const DeviceBuffer<WaterMolecule> d_atomIds,
- const SettleParameters settleParameters,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- DeviceBuffer<float> virialScaled,
- const PbcAiuc pbcAiuc,
- const DeviceStream& deviceStream)
+void launchSettleGpuKernel(const int numSettles,
+ const DeviceBuffer<WaterMolecule>& d_atomIds,
+ const SettleParameters& settleParameters,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ const bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ const real invdt,
+ const bool computeVirial,
+ DeviceBuffer<float> virialScaled,
+ const PbcAiuc& pbcAiuc,
+ const DeviceStream& deviceStream)
{
static_assert(
gmx::isPowerOfTwo(sc_threadsPerBlock),
nullptr,
"settle_kernel<updateVelocities, computeVirial>",
kernelArgs);
-
- return;
}
} // namespace gmx
* \param[in] pbcAiuc PBC data.
* \param[in] deviceStream Device stream to launch kernel in.
*/
-void launchSettleGpuKernel(int numSettles,
- const DeviceBuffer<WaterMolecule> d_atomIds,
- const SettleParameters settleParameters,
- const DeviceBuffer<Float3> d_x,
- DeviceBuffer<Float3> d_xp,
- const bool updateVelocities,
- DeviceBuffer<Float3> d_v,
- const real invdt,
- const bool computeVirial,
- DeviceBuffer<float> virialScaled,
- const PbcAiuc pbcAiuc,
- const DeviceStream& deviceStream);
+void launchSettleGpuKernel(int numSettles,
+ const DeviceBuffer<WaterMolecule>& d_atomIds,
+ const SettleParameters& settleParameters,
+ const DeviceBuffer<Float3>& d_x,
+ DeviceBuffer<Float3> d_xp,
+ bool updateVelocities,
+ DeviceBuffer<Float3> d_v,
+ real invdt,
+ bool computeVirial,
+ DeviceBuffer<float> virialScaled,
+ const PbcAiuc& pbcAiuc,
+ const DeviceStream& deviceStream);
} // namespace gmx
{
void launchSettleGpuKernel(const int /* numSettles */,
- const DeviceBuffer<WaterMolecule> /* d_atomIds */,
- const SettleParameters /* settleParameters */,
- const DeviceBuffer<Float3> /* d_x */,
+ const DeviceBuffer<WaterMolecule>& /* d_atomIds */,
+ const SettleParameters& /* settleParameters */,
+ const DeviceBuffer<Float3>& /* d_x */,
DeviceBuffer<Float3> /* d_xp */,
const bool /* updateVelocities */,
DeviceBuffer<Float3> /* d_v */,
const real /* invdt */,
const bool /* computeVirial */,
DeviceBuffer<float> /* virialScaled */,
- const PbcAiuc /* pbcAiuc */,
+ const PbcAiuc& /* pbcAiuc */,
const DeviceStream& /* deviceStream */)
{
// SYCL_TODO
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
-static constexpr bool sc_haveGpuConstraintSupport = (GMX_GPU_CUDA);
+static constexpr bool sc_haveGpuConstraintSupport = GMX_GPU_CUDA;
namespace gmx
{
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
- return;
}
void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
*/
void set(DeviceBuffer<Float3> d_x,
DeviceBuffer<Float3> d_v,
- const DeviceBuffer<Float3> d_f,
+ DeviceBuffer<Float3> d_f,
const InteractionDefinitions& idef,
const t_mdatoms& md);
void launchScaleCoordinatesKernel(const int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream)
{
KernelLaunchConfig kernelLaunchConfig;
*/
void launchScaleCoordinatesKernel(int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream);
} // namespace gmx
void launchScaleCoordinatesKernel(const int numAtoms,
DeviceBuffer<Float3> d_coordinates,
- const ScalingMatrix mu,
+ const ScalingMatrix& mu,
const DeviceStream& deviceStream)
{
const cl::sycl::range<1> rangeAllAtoms(numAtoms);
*
* \returns Tuple containing the index of the first atom in the range and the total number of atoms in the range.
*/
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
/*! \brief Get the positions buffer on the GPU.
*
* \returns The number of local atoms.
*/
- int numAtomsLocal();
+ int numAtomsLocal() const;
/*! \brief Getter for the total number of atoms.
*
* \returns The total number of atoms.
*/
- int numAtomsAll();
+ int numAtomsAll() const;
private:
class Impl;
"GPU implementation.");
}
-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
return nullptr;
}
-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
return 0;
}
-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
*
* \returns Tuple containing the index of the first atom in the range and the total number of atoms in the range.
*/
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
/*! \brief Get the positions buffer on the GPU.
*
* \returns The number of local atoms.
*/
- int numAtomsLocal();
+ int numAtomsLocal() const;
/*! \brief Getter for the total number of atoms.
*
* \returns The total number of atoms.
*/
- int numAtomsAll();
+ int numAtomsAll() const;
private:
//! GPU PME stream.
void clearOnDevice(DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- const DeviceStream& deviceStream);
+ const DeviceStream& deviceStream) const;
};
} // namespace gmx
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
{
int atomsStartAt = 0;
int numAtomsToCopy = 0;
void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- const DeviceStream& deviceStream)
+ const DeviceStream& deviceStream) const
{
GMX_UNUSED_VALUE(dataSize);
return updateStream_;
}
-int StatePropagatorDataGpu::Impl::numAtomsLocal()
+int StatePropagatorDataGpu::Impl::numAtomsLocal() const
{
return numAtomsLocal_;
}
-int StatePropagatorDataGpu::Impl::numAtomsAll()
+int StatePropagatorDataGpu::Impl::numAtomsAll() const
{
return numAtomsAll_;
}
return impl_->reinit(numAtomsLocal, numAtomsAll);
}
-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
{
return impl_->getAtomRangesFromAtomLocality(atomLocality);
}
return impl_->getUpdateStream();
}
-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
{
return impl_->numAtomsLocal();
}
-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
{
return impl_->numAtomsAll();
}
--- /dev/null
+# List of rationales for check suppressions (where known).
+# These have to precede the list because inline comments are not
+# supported by clang-tidy.
+#
+# -readability-implicit-bool-conversion
+# This happens often in the GPU code and should generally be harmless
+Checks: -readability-implicit-bool-conversion
+InheritParentConfig: true
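For reference, a hypothetical illustration (not taken from the GROMACS sources) of the pattern this suppressed check flags:

float* d_buffer = nullptr;
int    count    = 3;

// Implicit conversions that readability-implicit-bool-conversion reports:
bool haveBuffer = d_buffer; // pointer -> bool
bool haveWork   = count;    // int -> bool

// The spellings the check would suggest instead:
bool haveBuffer2 = (d_buffer != nullptr);
bool haveWork2   = (count != 0);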
 * there is a bit of fluctuation in the generated block counts, we use
* a target of 44 instead of the ideal value of 48.
*/
-static unsigned int gpu_min_ci_balanced_factor = 44;
+static const unsigned int gpu_min_ci_balanced_factor = 44;
void gpu_init_platform_specific(NbnxmGpu* /* nb */)
{
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
# endif /* CALC_ENERGIES */
#endif /* PRUNE_NBL */
- (const NBAtomDataGpu atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+ (NBAtomDataGpu atdat, NBParamGpu nbparam, Nbnxm::gpu_plist plist, bool bCalcFshift)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
#else
* sm_nextSlotPtr should always be updated to point to the "next slot",
* that is past the last point where data has been stored.
*/
+ // NOLINTNEXTLINE(readability-redundant-declaration)
extern __shared__ char sm_dynamicShmem[];
char* sm_nextSlotPtr = sm_dynamicShmem;
static_assert(sizeof(char) == 1,
"The shared memory offset calculation assumes that char is 1 byte");
/* shmem buffer for i x+q pre-loading */
- float4* xqib = (float4*)sm_nextSlotPtr;
+ float4* xqib = reinterpret_cast<float4*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));
/* shmem buffer for cj, for each warp separately */
- int* cjs = (int*)(sm_nextSlotPtr);
+ int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
/* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
# ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
- int* atib = (int*)sm_nextSlotPtr;
+ int* atib = reinterpret_cast<int*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
# else
/* shmem buffer for i-atom LJ combination rule parameters */
- float2* ljcpib = (float2*)sm_nextSlotPtr;
+ float2* ljcpib = reinterpret_cast<float2*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
# endif
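The code above is a bump allocator over the dynamically sized shared-memory block: each buffer is carved off at sm_nextSlotPtr, which then advances past it. A condensed, self-contained sketch of the same pattern (buffer counts here are invented for illustration):

extern __shared__ char sm_dynamicShmem[];

__global__ void bumpAllocExample()
{
    char* sm_nextSlotPtr = sm_dynamicShmem;

    // First slot: 64 float4 elements, starting at the (16-byte aligned) base.
    float4* xqBuf = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += 64 * sizeof(*xqBuf);

    // The second slot begins exactly where the first one ended.
    int* cjBuf = reinterpret_cast<int*>(sm_nextSlotPtr);
    sm_nextSlotPtr += 32 * sizeof(*cjBuf);

    // The launch must request the sum as dynamic shared memory:
    //   bumpAllocExample<<<grid, block, 64 * sizeof(float4) + 32 * sizeof(int)>>>();
    (void)xqBuf;
    (void)cjBuf;
}

Placing the buffer with the strictest alignment requirement (float4 here) first keeps every subsequent slot adequately aligned without explicit padding.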
/*********************************************************************/
ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
ai = ci * c_clSize + tidxi;
- float* shiftptr = (float*)&shift_vec[nb_sci.shift];
- xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
+ const float* shiftptr = reinterpret_cast<const float*>(&shift_vec[nb_sci.shift]);
+ xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0F);
xqbuf.w *= nbparam.epsfac;
xqib[tidxj * c_clSize + tidxi] = xqbuf;
for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- fci_buf[i] = make_float3(0.0f);
+ fci_buf[i] = make_float3(0.0F);
}
# ifdef LJ_EWALD
# ifdef CALC_ENERGIES
- E_lj = 0.0f;
- E_el = 0.0f;
+ E_lj = 0.0F;
+ E_el = 0.0F;
# ifdef EXCLUSION_FORCES /* Ewald or RF */
if (nb_sci.shift == gmx::c_centralShiftIndex
# ifdef LJ_EWALD
// load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
- E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (ntypes + 1)]);
+ E_lj += LDG(reinterpret_cast<float*>(
+ &nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1)]));
# endif
}
/* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
# ifdef LJ_EWALD
E_lj /= c_clSize * NTHREAD_Z;
- E_lj *= 0.5f * c_oneSixth * lje_coeff6_6;
+ E_lj *= 0.5F * c_oneSixth * lje_coeff6_6;
# endif
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
/* Correct for epsfac^2 due to adding qi^2 */
E_el /= nbparam.epsfac * c_clSize * NTHREAD_Z;
# if defined EL_RF || defined EL_CUTOFF
- E_el *= -0.5f * reactionFieldShift;
+ E_el *= -0.5F * reactionFieldShift;
# else
E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
# endif
ljcp_j = lj_comb[aj];
# endif
- fcj_buf = make_float3(0.0f);
+ fcj_buf = make_float3(0.0F);
# if !defined PRUNE_NBL
# pragma unroll 8
}
# endif
- int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+ int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;
/* cutoff & exclusion check */
# ifdef EXCLUSION_FORCES
sig_r6 *= int_bit;
# endif /* EXCLUSION_FORCES */
- F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+ F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
# endif /* !LJ_COMB_LB || CALC_ENERGIES */
# ifdef LJ_FORCE_SWITCH
/* Separate VDW cut-off check to enable twin-range cut-offs
* (rvdw < rcoulomb <= rlist)
*/
- vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
F_invr *= vdw_in_range;
# ifdef CALC_ENERGIES
E_lj_p *= vdw_in_range;
# endif
# ifdef EL_RF
E_el += qi * qj_f
- * (int_bit * inv_r + 0.5f * two_k_rf * r2 - reactionFieldShift);
+ * (int_bit * inv_r + 0.5F * two_k_rf * r2 - reactionFieldShift);
# endif
# ifdef EL_EWALD_ANY
- /* 1.0f - erff is faster than erfcf */
+ /* 1.0F - erff is faster than erfcf */
E_el += qi * qj_f
* (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
# endif /* EL_EWALD_ANY */
bCalcFshift = false;
}
- float fshift_buf = 0.0f;
+ float fshift_buf = 0.0F;
/* reduce i forces */
for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
*/
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
- void nbnxn_kernel_prune_cuda(const NBAtomDataGpu atdat,
- const NBParamGpu nbparam,
- const Nbnxm::gpu_plist plist,
- int numParts,
- int part)
+ void nbnxn_kernel_prune_cuda(NBAtomDataGpu atdat,
+ NBParamGpu nbparam,
+ Nbnxm::gpu_plist plist,
+ int numParts,
+ int part)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
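Dropping the top-level const from these by-value parameters is what readability-avoid-const-params-in-decls asks for: with FUNCTION_DECLARATION_ONLY defined, the same text is compiled as a declaration, where the const has no effect on callers. A minimal illustration:

// In a declaration, top-level const on a by-value parameter is meaningless
// to the caller; these two lines declare the same function.
void f(const int n);
void f(int n);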
"The shared memory offset calculation assumes that char is 1 byte");
/* shmem buffer for i x+q pre-loading */
- float4* xib = (float4*)sm_nextSlotPtr;
+ float4* xib = reinterpret_cast<float4*>(sm_nextSlotPtr);
sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));
/* shmem buffer for cj, for each warp separately */
- int* cjs = (int*)(sm_nextSlotPtr);
+ int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
/* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
static const unsigned __device__ superClInteractionMask =
((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-static const float __device__ c_oneSixth = 0.16666667f;
-static const float __device__ c_oneTwelveth = 0.08333333f;
+static const float __device__ c_oneSixth = 0.16666667F;
+static const float __device__ c_oneTwelveth = 0.08333333F;
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}
/*! Apply potential switch, force-only version. */
-static __forceinline__ __device__ void
-calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
+static __forceinline__ __device__ void calculate_potential_switch_F(const NBParamGpu& nbparam,
+ float inv_r,
+ float r2,
+ float* F_invr,
+ const float* E_lj)
{
float r, r_switch;
float sw, dsw;
r_switch = r - nbparam.rvdw_switch;
/* Unlike in the F+E kernel, conditional is faster here */
- if (r_switch > 0.0f)
+ if (r_switch > 0.0F)
{
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
r = r2 * inv_r;
r_switch = r - nbparam.rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
/* Unlike in the F-only kernel, masking is faster here */
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam.sh_lj_ewald * int_bit;
- *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
/*! Fetch per-type LJ parameters.
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = expf(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam.sh_lj_ewald * int_bit;
- *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBParamGpu nbparam, float r)
{
float normalized = nbparam.coulomb_tab_scale * r;
- int index = (int)normalized;
+ int index = static_cast<int>(normalized);
float fraction = normalized - index;
float2 d01 = fetch_coulomb_force_r(nbparam, index);
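For context, the remainder of this function lies outside the hunk; d01 holds the tabulated force at index and index + 1, and the final step is presumably a linear interpolation between the two, along these lines:

    // Assumed completion, for illustration: blend the two fetched table
    // entries with the fractional part of the table coordinate.
    return d01.x + (d01.y - d01.x) * fraction;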
* Depending on what is supported, it fetches parameters either
* using direct load, texture objects, or texrefs.
*/
+// NOLINTNEXTLINE(google-runtime-references)
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
{
float2 c6c12;
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
- const float FN6 = -1.7357322914161492954e-8f;
- const float FN5 = 1.4703624142580877519e-6f;
- const float FN4 = -0.000053401640219807709149f;
- const float FN3 = 0.0010054721316683106153f;
- const float FN2 = -0.019278317264888380590f;
- const float FN1 = 0.069670166153766424023f;
- const float FN0 = -0.75225204789749321333f;
-
- const float FD4 = 0.0011193462567257629232f;
- const float FD3 = 0.014866955030185295499f;
- const float FD2 = 0.11583842382862377919f;
- const float FD1 = 0.50736591960530292870f;
- const float FD0 = 1.0f;
+ const float FN6 = -1.7357322914161492954e-8F;
+ const float FN5 = 1.4703624142580877519e-6F;
+ const float FN4 = -0.000053401640219807709149F;
+ const float FN3 = 0.0010054721316683106153F;
+ const float FN2 = -0.019278317264888380590F;
+ const float FN1 = 0.069670166153766424023F;
+ const float FN0 = -0.75225204789749321333F;
+
+ const float FD4 = 0.0011193462567257629232F;
+ const float FD3 = 0.014866955030185295499F;
+ const float FD2 = 0.11583842382862377919F;
+ const float FD1 = 0.50736591960530292870F;
+ const float FD0 = 1.0F;
float z4;
float polyFN0, polyFN1, polyFD0, polyFD1;
polyFD0 = polyFD0 * z4 + FD0;
polyFD0 = polyFD1 * z2 + polyFD0;
- polyFD0 = 1.0f / polyFD0;
+ polyFD0 = 1.0F / polyFD0;
polyFN0 = FN6 * z4 + FN4;
polyFN1 = FN5 * z4 + FN3;
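The constants above define a rational approximation FN(z2)/FD(z2), and the interleaved updates in z4 and z2 are a two-way split of Horner's rule that shortens the dependency chain. A self-contained sketch of that evaluation strategy on a generic degree-4 polynomial (the coefficients are placeholders, not the PME ones):

// Evaluate p(x) = c[4]*x^4 + c[3]*x^3 + c[2]*x^2 + c[1]*x + c[0]
// as (even part in x^2) + x * (odd part in x^2), so the two Horner
// chains carry no dependency on each other until the final combine.
static float hornerTwoWay(float x, const float c[5])
{
    const float x2   = x * x;
    float       even = c[4] * x2 + c[2];
    even             = even * x2 + c[0];  // c4*x^4 + c2*x^2 + c0
    const float odd  = c[3] * x2 + c[1];  // (c3*x^2 + c1), times x below
    return even + odd * x;
}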
* arbitrary array sizes.
*/
static __forceinline__ __device__ void
-reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
+reduce_force_j_generic(const float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
if (tidxi < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
{
f += f_buf[c_fbufStride * tidxi + j];
* arbitrary array sizes.
* TODO: add the tidxi < 3 trick
*/
-static __forceinline__ __device__ void reduce_force_i_generic(float* f_buf,
- float3* fout,
- float* fshift_buf,
- bool bCalcFshift,
- int tidxi,
- int tidxj,
- int aidx)
+static __forceinline__ __device__ void reduce_force_i_generic(const float* f_buf,
+ float3* fout,
+ float* fshift_buf,
+ bool bCalcFshift,
+ int tidxi,
+ int tidxj,
+ int aidx)
{
if (tidxj < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
{
f += f_buf[tidxj * c_fbufStride + j];
int i, j;
float f;
- assert(c_clSize == 1 << c_clSizeLog2);
+ static_assert(c_clSize == 1 << c_clSizeLog2);
/* Reduce the initial c_clSize values for each i atom to half
* every step by using c_clSize * i threads.
// Destination address where x should be stored in nbnxm layout. We use this cast here to
// save only x, y and z components, not touching the w (q) component, which is pre-defined.
- float3* gm_xqDest = (float3*)&gm_xq[threadIndex + offset];
+ float3* gm_xqDest = reinterpret_cast<float3*>(&gm_xq[threadIndex + offset]);
// Perform layout conversion of each element.
if (threadIndex < numAtoms)
#include "gromacs/gpu_utils/gputraits.h"
class DeviceStream;
-class Grid;
struct NbnxmGpu;
namespace Nbnxm
{
+class Grid;
+
/*! \brief Launch coordinate layout conversion kernel
*
* \param[in] grid Pair-search grid.
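Moving the forward declaration of Grid is more than cosmetic: a forward declaration introduces the name into the enclosing namespace, so declaring it at global scope would have named a different type than Nbnxm::Grid. A minimal illustration:

class Grid; // declares ::Grid

namespace Nbnxm
{
class Grid; // declares Nbnxm::Grid, a distinct type from ::Grid
} // namespace Nbnxm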
{
if (nbp->coulomb_tab)
{
- destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+ destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
}
nbp->coulomb_tab_scale = tables.scale;
/* Free nbparam */
if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
{
- destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+ destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
}
if (!useLjCombRule(nb->nbparam->vdwType))
{
- destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+ destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
}
if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
{
- destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+ destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
}
/* Free plist */
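These call sites imply that destroyParamLookupTable now takes the texture object by pointer rather than by value, presumably so the callee can reset it and to avoid a non-const reference parameter. A hypothetical shape of the signature (an assumption for illustration; the actual GROMACS declaration may differ in names and template parameters):

// Assumed signature, not shown in this patch:
template<typename ValueType>
void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture* deviceTexture);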
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static inline void setPbcAiuc(int numPbcDim, const matrix box, PbcAiuc* pbcAiuc)
{
- pbcAiuc->invBoxDiagZ = 0.0f;
- pbcAiuc->boxZX = 0.0f;
- pbcAiuc->boxZY = 0.0f;
- pbcAiuc->boxZZ = 0.0f;
- pbcAiuc->invBoxDiagY = 0.0f;
- pbcAiuc->boxYX = 0.0f;
- pbcAiuc->boxYY = 0.0f;
- pbcAiuc->invBoxDiagX = 0.0f;
- pbcAiuc->boxXX = 0.0f;
+ pbcAiuc->invBoxDiagZ = 0.0F;
+ pbcAiuc->boxZX = 0.0F;
+ pbcAiuc->boxZY = 0.0F;
+ pbcAiuc->boxZZ = 0.0F;
+ pbcAiuc->invBoxDiagY = 0.0F;
+ pbcAiuc->boxYX = 0.0F;
+ pbcAiuc->boxYY = 0.0F;
+ pbcAiuc->invBoxDiagX = 0.0F;
+ pbcAiuc->boxXX = 0.0F;
if (numPbcDim > ZZ)
{
- pbcAiuc->invBoxDiagZ = 1.0f / box[ZZ][ZZ];
+ pbcAiuc->invBoxDiagZ = 1.0F / box[ZZ][ZZ];
pbcAiuc->boxZX = box[ZZ][XX];
pbcAiuc->boxZY = box[ZZ][YY];
pbcAiuc->boxZZ = box[ZZ][ZZ];
}
if (numPbcDim > YY)
{
- pbcAiuc->invBoxDiagY = 1.0f / box[YY][YY];
+ pbcAiuc->invBoxDiagY = 1.0F / box[YY][YY];
pbcAiuc->boxYX = box[YY][XX];
pbcAiuc->boxYY = box[YY][YY];
}
if (numPbcDim > XX)
{
- pbcAiuc->invBoxDiagX = 1.0f / box[XX][XX];
+ pbcAiuc->invBoxDiagX = 1.0F / box[XX][XX];
pbcAiuc->boxXX = box[XX][XX];
}
}
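The fields set up here feed the minimum-image computation in pbcDxAiuc below; as a rough sketch of how such a shift is applied along z for a triclinic box (simplified from the general scheme, not a verbatim excerpt):

    // Simplified illustration: remove whole-box multiples along z.
    // shiftZ is the nearest integer number of box images separating the points.
    float shiftZ = rintf(dr.z * pbcAiuc.invBoxDiagZ);
    dr.x -= shiftZ * pbcAiuc.boxZX;
    dr.y -= shiftZ * pbcAiuc.boxZY;
    dr.z -= shiftZ * pbcAiuc.boxZZ;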
*/
template<bool returnShift>
static __forceinline__ __device__ int
+// NOLINTNEXTLINE(google-runtime-references)
pbcDxAiuc(const PbcAiuc& pbcAiuc, const float4 r1, const float4 r2, float3& dr)
{
dr.x = r1.x - r2.x;