Update some nbnxm kernel constants to constexpr
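The renames below replace the short c_numClPerSupercl constant and the NBNXN_MIN_RSQ macro with the constexpr names c_nbnxnGpuNumClusterPerSupercluster and c_nbnxnMinDistanceSquared. As a minimal sketch, the constexpr definitions might look like the following; the header location and the numeric value of c_nbnxnMinDistanceSquared are illustrative assumptions, only the value 8 is implied by the kernel comment in this diff:

    /* i-clusters per super-cluster on the GPU; the interaction masks in the
       kernel rely on one bit per i-cluster, i.e. 8 bits per j-cluster block. */
    constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;

    /* Lower bound applied to r^2 so that r^-12 cannot overflow; the value
       here is a placeholder, the real one is defined in the nbnxm headers. */
    constexpr float c_nbnxnMinDistanceSquared = 3.82e-07F;

Keeping these as constexpr values rather than macros preserves compile-time evaluation while giving the CUDA compiler explicit types.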
[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_cuda_kernel.cuh
index 99b1f048cfe520b1d08abc408a4b45b255174abc..89f75da5f9c2731d88b4c7b8aae9060c2c451e9a 100644 (file)
@@ -244,11 +244,11 @@ __launch_bounds__(THREADS_PER_BLOCK)
     unsigned int wexcl, imask, mask_ji;
     float4       xqbuf;
     float3       xi, xj, rv, f_ij, fcj_buf;
-    float3       fci_buf[c_numClPerSupercl]; /* i force buffer */
+    float3       fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
     nbnxn_sci_t  nb_sci;
 
-    /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
-    const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+    /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+    const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
 
     /*********************************************************************
      * Set up shared memory pointers.
@@ -262,7 +262,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
 
     /* shmem buffer for i x+q pre-loading */
     float4* xqib = (float4*)sm_nextSlotPtr;
-    sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib));
+    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));
 
     /* shmem buffer for cj, for each warp separately */
     int* cjs = (int*)(sm_nextSlotPtr);
@@ -273,11 +273,11 @@ __launch_bounds__(THREADS_PER_BLOCK)
 #    ifndef LJ_COMB
     /* shmem buffer for i atom-type pre-loading */
     int* atib = (int*)sm_nextSlotPtr;
-    sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*atib));
+    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
 #    else
     /* shmem buffer for i-atom LJ combination rule parameters */
     float2* ljcpib = (float2*)sm_nextSlotPtr;
-    sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*ljcpib));
+    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
 #    endif
     /*********************************************************************/
 
@@ -289,7 +289,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
     if (tidxz == 0)
     {
         /* Pre-load i-atom x and q into shared memory */
-        ci = sci * c_numClPerSupercl + tidxj;
+        ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
         ai = ci * c_clSize + tidxi;
 
         float* shiftptr = (float*)&shift_vec[nb_sci.shift];
@@ -307,7 +307,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
     }
     __syncthreads();
 
-    for (i = 0; i < c_numClPerSupercl; i++)
+    for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
     {
         fci_buf[i] = make_float3(0.0f);
     }
@@ -324,10 +324,10 @@ __launch_bounds__(THREADS_PER_BLOCK)
     E_el         = 0.0f;
 
 #        ifdef EXCLUSION_FORCES /* Ewald or RF */
-    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_numClPerSupercl)
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
     {
         /* we have the diagonal: add the charge and LJ self interaction energy term */
-        for (i = 0; i < c_numClPerSupercl; i++)
+        for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
         {
 #            if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
             qi = xqib[i * c_clSize + tidxi].w;
@@ -336,12 +336,13 @@ __launch_bounds__(THREADS_PER_BLOCK)
 
 #            ifdef LJ_EWALD
 #                if DISABLE_CUDA_TEXTURES
-            E_lj += LDG(
-                    &nbparam.nbfp[atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2]);
+            E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                      * (ntypes + 1) * 2]);
 #                else
             E_lj += tex1Dfetch<float>(
                     nbparam.nbfp_texobj,
-                    atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2);
+                    atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                            * (ntypes + 1) * 2);
 #                endif
 #            endif
         }
@@ -397,9 +398,9 @@ __launch_bounds__(THREADS_PER_BLOCK)
                Tested with up to nvcc 7.5 */
             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
             {
-                if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
+                if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
                 {
-                    mask_ji = (1U << (jm * c_numClPerSupercl));
+                    mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
 
                     cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
                     aj = cj * c_clSize + tidxj;
@@ -419,11 +420,11 @@ __launch_bounds__(THREADS_PER_BLOCK)
 #    if !defined PRUNE_NBL
 #        pragma unroll 8
 #    endif
-                    for (i = 0; i < c_numClPerSupercl; i++)
+                    for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                     {
                         if (imask & mask_ji)
                         {
-                            ci = sci * c_numClPerSupercl + i; /* i cluster index */
+                            ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */
 
                             /* all threads load an atom from i cluster ci into shmem! */
                             xqbuf = xqib[i * c_clSize + tidxi];
@@ -475,7 +476,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
 #    endif     /* LJ_COMB */
 
                                 // Ensure distances do not become so small that r^-12 overflows
-                                r2 = max(r2, NBNXN_MIN_RSQ);
+                                r2 = max(r2, c_nbnxnMinDistanceSquared);
 
                                 inv_r  = rsqrt(r2);
                                 inv_r2 = inv_r * inv_r;
@@ -629,9 +630,9 @@ __launch_bounds__(THREADS_PER_BLOCK)
     float fshift_buf = 0.0f;
 
     /* reduce i forces */
-    for (i = 0; i < c_numClPerSupercl; i++)
+    for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
     {
-        ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
+        ai = (sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi;
         reduce_force_i_warp_shfl(fci_buf[i], f, &fshift_buf, bCalcFshift, tidxj, ai, c_fullWarpMask);
     }