+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! \internal \file
- *
- * \brief
- * Declares constants for the module
- *
- * \author Berk Hess <hess@kth.se>
- * \ingroup module_nbnxm
- */
-
-#ifndef GMX_NBNXN_CONSTANTS_H
-#define GMX_NBNXN_CONSTANTS_H
-
-/*! \brief Lower limit for square interaction distances in nonbonded kernels.
- *
- * For smaller values we will overflow when calculating r^-1 or r^-12, but
- * to keep it simple we always apply the limit from the tougher r^-12 condition.
- */
-#if GMX_DOUBLE
-// Some double precision SIMD architectures use single precision in the first
-// step, so although the double precision criterion would allow smaller rsq,
-// we need to stay in single precision with some margin for the N-R iterations.
-# define NBNXN_MIN_RSQ 1.0e-36
-#else
-// The worst intermediate value we might evaluate is r^-12, which
-// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin)
-# define NBNXN_MIN_RSQ 3.82e-07f // r > 6.2e-4
-#endif
-
-
-//! The number of clusters in a super-cluster, used for GPU
-#define c_nbnxnGpuNumClusterPerSupercluster 8
-
-/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage
- * of integers containing 32 bits.
- */
-#define c_nbnxnGpuJgroupSize (32 / c_nbnxnGpuNumClusterPerSupercluster)
-
-#endif
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
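+    /* For example, with the defaults c_nbnxnGpuNumClusterPerSupercluster = 8
+     * and c_clSize = 8, this float4 buffer takes 8 * 8 * 16 = 1024 bytes. */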
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
if (nbp->vdwtype == evdwCuCUTCOMBGEOM || nbp->vdwtype == evdwCuCUTCOMBLB)
{
/* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float2);
}
else
{
/* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int);
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int);
}
return shmem;
"\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
- config.gridSize[1], plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c,
- config.sharedMemorySize);
+ config.gridSize[1], plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
int shmem;
/* i-atom x in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
"\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
- config.gridSize[1], numSciInPart * c_numClPerSupercl, c_numClPerSupercl,
- plist->na_c, config.sharedMemorySize);
+ config.gridSize[1], numSciInPart * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
auto* timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
unsigned int wexcl, imask, mask_ji;
float4 xqbuf;
float3 xi, xj, rv, f_ij, fcj_buf;
- float3 fci_buf[c_numClPerSupercl]; /* i force buffer */
+ float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
nbnxn_sci_t nb_sci;
- /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
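+    /* With c_nbnxnGpuNumClusterPerSupercluster = 8 this mask is 0xff: one
+     * interaction bit for each i-cluster in the super-cluster. */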
/*********************************************************************
* Set up shared memory pointers.
/* shmem buffer for i x+q pre-loading */
float4* xqib = (float4*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));
/* shmem buffer for cj, for each warp separately */
int* cjs = (int*)(sm_nextSlotPtr);
# ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
int* atib = (int*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*atib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
# else
/* shmem buffer for i-atom LJ combination rule parameters */
float2* ljcpib = (float2*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*ljcpib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
# endif
/*********************************************************************/
if (tidxz == 0)
{
/* Pre-load i-atom x and q into shared memory */
- ci = sci * c_numClPerSupercl + tidxj;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
ai = ci * c_clSize + tidxi;
float* shiftptr = (float*)&shift_vec[nb_sci.shift];
}
__syncthreads();
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
fci_buf[i] = make_float3(0.0f);
}
E_el = 0.0f;
# ifdef EXCLUSION_FORCES /* Ewald or RF */
- if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_numClPerSupercl)
+ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal: add the charge and LJ self interaction energy term */
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
qi = xqib[i * c_clSize + tidxi].w;
# ifdef LJ_EWALD
# if DISABLE_CUDA_TEXTURES
- E_lj += LDG(
- &nbparam.nbfp[atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2]);
+ E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1) * 2]);
# else
E_lj += tex1Dfetch<float>(
nbparam.nbfp_texobj,
- atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2);
+ atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1) * 2);
# endif
# endif
}
Tested with up to nvcc 7.5 */
for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- mask_ji = (1U << (jm * c_numClPerSupercl));
+ mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
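+            /* Bit (jm * c_nbnxnGpuNumClusterPerSupercluster + i) of imask flags
+             * the pair of the i-th i-cluster with the jm-th j-cluster. */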
cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
aj = cj * c_clSize + tidxj;
# if !defined PRUNE_NBL
# pragma unroll 8
# endif
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imask & mask_ji)
{
- ci = sci * c_numClPerSupercl + i; /* i cluster index */
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
xqbuf = xqib[i * c_clSize + tidxi];
# endif /* LJ_COMB */
// Ensure the distance does not become so small that r^-12 overflows
- r2 = max(r2, NBNXN_MIN_RSQ);
+ r2 = max(r2, c_nbnxnMinDistanceSquared);
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
float fshift_buf = 0.0f;
/* reduce i forces */
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
+ ai = (sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi;
reduce_force_i_warp_shfl(fci_buf[i], f, &fshift_buf, bCalcFshift, tidxj, ai, c_fullWarpMask);
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* shmem buffer for i x+q pre-loading */
float4* xib = (float4*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));
/* shmem buffer for cj, for each warp separately */
int* cjs = (int*)(sm_nextSlotPtr);
if (tidxz == 0)
{
/* Pre-load i-atom x and q into shared memory */
- int ci = sci * c_numClPerSupercl + tidxj;
+ int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
int ai = ci * c_clSize + tidxi;
/* We don't need q, but using float4 in shmem avoids bank conflicts.
# pragma unroll 4
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * c_numClPerSupercl));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
int cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
int aj = cj * c_clSize + tidxj;
float3 xj = make_float3(tmp.x, tmp.y, tmp.z);
# pragma unroll 8
- for (int i = 0; i < c_numClPerSupercl; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
static const int __device__ c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
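+// For example, the default c_clSize = 8 with c_nbnxnGpuClusterpairSplit = 2
+// gives c_splitClSize = 4.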
/*! \brief Stride in the force accumulation buffer */
static const int __device__ c_fbufStride = c_clSizeSq;
-/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
-static const unsigned __device__ superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+/*! \brief i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+static const unsigned __device__ superClInteractionMask =
+ ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
static const float __device__ c_oneSixth = 0.16666667f;
static const float __device__ c_oneTwelveth = 0.08333333f;
/* TODO: consider moving this to kernel_utils */
/* Convenience defines */
-/*! \brief number of clusters per supercluster. */
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
/*! \brief cluster size = number of atoms per cluster. */
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
/*! \brief Electrostatic CUDA kernel flavors.
*
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/fatalerror.h"
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl,
const nbnxn_atomdata_t* nbat,
vctot = 0;
Vvdwtot = 0;
- if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_numClPerSupercl)
+ if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal:
* add the charge self interaction energy term
*/
- for (im = 0; im < c_numClPerSupercl; im++)
+ for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
{
- ci = sci * c_numClPerSupercl + im;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
for (ic = 0; ic < c_clSize; ic++)
{
ia = ci * c_clSize + ic;
{
cj = nbl->cj4[cj4_ind].cj[jm];
- for (im = 0; im < c_numClPerSupercl; im++)
+ for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
{
/* We're only using the first imask,
* but here imei[1].imask is identical.
*/
- if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_numClPerSupercl + im)) & 1)
+ if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
+ & 1)
{
gmx_bool within_rlist;
- ci = sci * c_numClPerSupercl + im;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
within_rlist = FALSE;
npair = 0;
c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
int_bit = static_cast<real>(
(excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
- >> (jm * c_numClPerSupercl + im))
+ >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
& 1);
js = ja * nbat->xstride;
}
// Ensure the distance does not become so small that r^-12 overflows
- rsq = std::max(rsq, NBNXN_MIN_RSQ);
+ rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
rinv = gmx::invsqrt(rsq);
rinvsq = rinv * rinv;
// Ensure the distances do not fall below the limit where r^-12 overflows.
// This should never happen for normal interactions.
- rsq = std::max(rsq, NBNXN_MIN_RSQ);
+ rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
#ifdef COUNT_PAIRS
npair++;
rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
#endif
- minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+ minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);
const real* gmx_restrict q = nbatParams.q.data();
const real facel = ic->epsfac;
rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
#endif
- minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+ minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);
const real* gmx_restrict q = nbatParams.q.data();
const real facel = ic->epsfac;
string(REGEX REPLACE ".*=" "" ELEC_NAME "${ELEC_DEF}")
string(REGEX REPLACE ".*=" "" VDW_NAME "${VDW_DEF}")
set(OBJ_FILE nbnxm_ocl_kernel${ELEC_NAME}${VDW_NAME}_${VENDOR}.o)
+    # The constants below duplicate various others (e.g. from pairlist.h),
+    # but since the kernels compiled here are not used for production,
+    # it is OK if the values fall out of sync.
add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
${CMAKE_CURRENT_SOURCE_DIR}/nbnxm_ocl_kernels.cl ${CLANG_TIDY_ARGS}
-Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
-DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
- -DNBNXN_GPU_CLUSTER_SIZE=${CLUSTER_SIZE} -DIATYPE_SHMEM
+ -Dc_nbnxnGpuClusterSize=${CLUSTER_SIZE}
+ -Dc_nbnxnMinDistanceSquared=3.82e-07F
+ -Dc_nbnxnGpuNumClusterPerSupercluster=8
+ -Dc_nbnxnGpuJgroupSize=4
+ -DIATYPE_SHMEM
-c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
-Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
-Wno-cast-align -Wno-incompatible-pointer-types
/*! \brief Convenience constants */
//@{
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
//@}
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
/* cj in shared memory, for both warps separately
* TODO: in the "nowarp kernels we load cj only once so the factor 2 is not needed.
*/
if (useLjCombRule(vdwType))
{
/* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * 2 * sizeof(float); /* atib abused for ljcp, float2 */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
+ * sizeof(float); /* atib abused for ljcp, float2 */
}
else
{
/* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
}
}
/* force reduction buffers in shared memory */
"Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2],
config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
- plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c);
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
}
fillin_ocl_structures(nbp, &nbparams_params);
int shmem;
/* i-atom x in shared memory (for convenience we load all 4 components including q) */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4;
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
/* cj in shared memory, for each warp separately
* Note: only need to load once per wavefront, but to keep the code simple,
* for now we load twice on AMD.
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2],
config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
- plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
cl_nbparam_params_t nbparams_params;
std::string extraDefines =
makeDefinesForKernelTypes(bFastGen, nb->nbparam->eeltype, nb->nbparam->vdwtype);
- /* Here we pass macros and static const int variables defined
+ /* Here we pass macros and static const/constexpr int variables defined
* in include files outside the OpenCL code as macros, to avoid
- * including those files in the JIT compilation that happens
- * at runtime. This is particularly a problem for headers that
- * depend on config.h, such as pairlist.h. */
+ * including those files in the plain-C JIT compilation that happens
+ * at runtime. */
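+    /* As a concrete example (assuming the default values of these constants),
+     * the appended defines are:
+     *   -Dc_nbnxnGpuClusterSize=8 -Dc_nbnxnMinDistanceSquared=3.82e-07
+     *   -Dc_nbnxnGpuNumClusterPerSupercluster=8 -Dc_nbnxnGpuJgroupSize=4 */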
extraDefines += gmx::formatString(
- " -DNBNXN_GPU_CLUSTER_SIZE=%d "
+ " -Dc_nbnxnGpuClusterSize=%d"
+ " -Dc_nbnxnMinDistanceSquared=%g"
+ " -Dc_nbnxnGpuNumClusterPerSupercluster=%d"
+ " -Dc_nbnxnGpuJgroupSize=%d"
"%s",
- c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */
- (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : "");
+ c_nbnxnGpuClusterSize, c_nbnxnMinDistanceSquared, c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuJgroupSize, (nb->bPrefetchLjParam) ? " -DIATYPE_SHMEM" : "");
try
{
/* TODO when we have a proper MPI-aware logging module,
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const int bidx = get_group_id(0);
const int widx = tidx / WARP_SIZE; /* warp index */
- /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
- const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
+#define LOCAL_OFFSET (xqib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
CjType cjs = 0;
#if USE_CJ_PREFETCH
/* shmem buffer for cj, for both warps separately */
/* shmem buffer for i atom-type pre-loading */
__local int* atib = (__local int*)(LOCAL_OFFSET); //NOLINT(google-readability-casting)
# undef LOCAL_OFFSET
-# define LOCAL_OFFSET (atib + NCL_PER_SUPERCL * CL_SIZE)
+# define LOCAL_OFFSET (atib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
# else
__local float2* ljcpib = (__local float2*)(LOCAL_OFFSET);
# undef LOCAL_OFFSET
-# define LOCAL_OFFSET (ljcpib + NCL_PER_SUPERCL * CL_SIZE)
+# define LOCAL_OFFSET (ljcpib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
# endif
#endif
const int cij4_start = nb_sci.cj4_ind_start; /* first ...*/
const int cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE)
{
/* Pre-load i-atom x and q into shared memory */
- const int ci = sci * NCL_PER_SUPERCL + tidxj + i;
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
const int ai = ci * CL_SIZE + tidxi;
float4 xqbuf = xq[ai]
#endif
barrier(CLK_LOCAL_MEM_FENCE);
- float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
fci_buf[ci_offset] = (float3)(0.0F);
}
float E_el = 0.0F;
# if defined EXCLUSION_FORCES /* Ewald or RF */
- if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL)
+ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal: add the charge and LJ self interaction energy term */
- for (int i = 0; i < NCL_PER_SUPERCL; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
const float qi = xqib[i * CL_SIZE + tidxi].w;
E_el += qi * qi;
# endif
# if defined LJ_EWALD
- E_lj += nbfp_climg2d[atom_types[(sci * NCL_PER_SUPERCL + i) * CL_SIZE + tidxi] * (ntypes + 1) * 2];
+ E_lj += nbfp_climg2d[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
+ * (ntypes + 1) * 2];
# endif /* LJ_EWALD */
}
#endif
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
+ if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
const int aj = cj * CL_SIZE + tidxj;
#if !defined PRUNE_NBL
# pragma unroll 8
#endif
- for (int i = 0; i < NCL_PER_SUPERCL; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imask & mask_ji)
{
- const int gmx_unused ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+ const int gmx_unused ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
const float4 xiqbuf = xqib[i * CL_SIZE + tidxi];
# endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB */
- // Ensure distance do not become so small that r^-12 overflows
- r2 = max(r2, NBNXN_MIN_RSQ);
+        // Ensure the distance does not become so small that r^-12 overflows.
+ // Cast to float to ensure the correct built-in max() function
+ // is called.
+ r2 = max(r2, (float)c_nbnxnMinDistanceSquared);
const float inv_r = rsqrt(r2);
const float inv_r2 = inv_r * inv_r;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#endif
// TODO move these consts to utils and unify their use with the nonbonded kernels
- const int c_numClPerSupercl = NCL_PER_SUPERCL;
- const int c_clSize = CL_SIZE;
+ const int c_clSize = CL_SIZE;
// TODO pass this value at compile-time as a macro
const int c_nbnxnGpuClusterpairSplit = 2;
- /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-#define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
+#define LOCAL_OFFSET (xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize)
/* shmem buffer for i cj pre-loading */
CjType cjs = 0;
#if USE_CJ_PREFETCH
cjs = (((__local int*)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize);
# undef LOCAL_OFFSET
/* Offset calculated using xib because cjs depends on tidxz! */
-# define LOCAL_OFFSET \
- (((__local int*)(xib + c_numClPerSupercl * c_clSize)) \
+# define LOCAL_OFFSET \
+ (((__local int*)(xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize)) \
+ (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize))
#endif
#if !USE_SUBGROUP_ANY
if (tidxz == 0)
{
- for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE)
{
/* Pre-load i-atom x and q into shared memory */
- const int ci = sci * c_numClPerSupercl + tidxj + i;
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
const int ai = ci * c_clSize + tidxi;
/* We don't need q, but using float4 in shmem avoids bank conflicts */
#pragma unroll 4
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * c_numClPerSupercl));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
const int aj = cj * c_clSize + tidxj;
const float3 xj = (float3)(tmp.xyz);
#pragma unroll 8
- for (int i = 0; i < c_numClPerSupercl; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"
-#include "gromacs/nbnxm/constants.h"
#include "gromacs/pbcutil/ishift.h"
#include "nbnxm_ocl_consts.h"
-#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
-#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster
+#define CL_SIZE (c_nbnxnGpuClusterSize)
#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
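+// For example, CL_SIZE = 8 gives WARP_SIZE = 8 * 8 / 2 = 32.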
*/
} nbnxn_excl_t;
-/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
-__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
+__constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload,
const __global int* gm_cj,
/* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
* for CL_SIZE>4.*/
float2 fshift_buf = 0;
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
- int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
float3 fin = fci_buf[ci_offset];
fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
__global float* fshift)
{
float fshift_buf = 0;
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
- int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
int tidx = tidxi + tidxj * CL_SIZE;
/* store i forces in shmem */
f_buf[tidx] = fci_buf[ci_offset].x;
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/real.h"
-// This file with constants is separate from this file to be able
-// to include it during OpenCL jitting without including config.h
-#include "constants.h"
#include "pairlistparams.h"
struct NbnxnPairlistCpuWork;
//! \}
//! \}
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * For smaller values we will overflow when calculating r^-1 or r^-12, but
+ * to keep it simple we always apply the limit from the tougher r^-12 condition.
+ */
+#if GMX_DOUBLE
+// Some double precision SIMD architectures use single precision in the first
+// step, so although the double precision criterion would allow smaller rsq,
+// we need to stay in single precision with some margin for the N-R iterations.
+constexpr double c_nbnxnMinDistanceSquared = 1.0e-36;
+#else
+// The worst intermediate value we might evaluate is r^-12, which
+// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin)
+constexpr float c_nbnxnMinDistanceSquared = 3.82e-07F; // r > 6.2e-4
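+// As a sanity check of this value (assuming GMX_FLOAT_MAX ~= 3.4e38):
+// pow(3.4e38, -1.0/6.0) ~= 3.78e-7, and the 1.01 margin gives ~3.82e-7,
+// corresponding to r > sqrt(3.82e-7) ~= 6.2e-4.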
+#endif
+
+
+//! The number of clusters in a super-cluster, used for GPU kernels
+constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
+
+/*! \brief With GPU kernels we group cluster pairs in groups of 4 to make
+ * full use of the 32 bits in the integer interaction masks.
+ */
+constexpr int c_nbnxnGpuJgroupSize = (32 / c_nbnxnGpuNumClusterPerSupercluster);
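+// With c_nbnxnGpuNumClusterPerSupercluster = 8 this gives a j-group size of 4:
+// bit (jm * 8 + i) of a 32-bit interaction mask flags i-cluster i against
+// j-cluster jm.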
+
/*! \internal
* \brief Simple pair-list i-unit
*/