From 5eedb4d2f25b23c05a4faadf86f126ad7ea3a419 Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland.schulz@intel.com>
Date: Tue, 2 Oct 2018 15:06:52 -0700
Subject: [PATCH] Remove majority of OCL command line constants

Related #2661

Change-Id: I8a139501bdd4f479183e829ba4c861f36c73afea
---
 src/gromacs/gpu_utils/ocl_compiler.cpp        |  2 +-
 src/gromacs/mdlib/nbnxn_consts.h              |  8 +++-
 .../mdlib/nbnxn_ocl/nbnxn_ocl_consts.h        | 47 +++++++++++++++++++
 .../mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp | 15 ++----
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh      |  4 +-
 .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh  |  1 -
 .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh      | 14 +++---
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h | 25 ++--------
 src/gromacs/mdlib/nbnxn_pairlist.h            |  8 +---
 src/gromacs/mdlib/nbnxn_search.cpp            |  2 +-
 10 files changed, 74 insertions(+), 52 deletions(-)
 create mode 100644 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h

diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp
index fa8c4137e6..e3e8b1e8c5 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -420,7 +420,7 @@ compileProgram(FILE              *fplog,
 {
     cl_int      cl_error;
     std::string kernelRootPath  = getSourceRootPath(kernelRelativePath);
-    std::string includeRootPath = getSourceRootPath("src/gromacs/gpu_utils");
+    std::string includeRootPath = getSourceRootPath("src");
 
     GMX_RELEASE_ASSERT(fplog != nullptr, "Need a valid log file for building OpenCL programs");
 
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h
index eb693466fc..24ffbb94ad 100644
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -36,7 +36,6 @@
 #ifndef _nbnxn_consts_h
 #define _nbnxn_consts_h
 
-
 /* With CPU kernels the i-cluster size is always 4 atoms.
  * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4.
  */
@@ -73,5 +72,12 @@
 #define NBNXN_INTERACTION_MASK_DIAG_J8_0  0xf0f8fcfeU
 #define NBNXN_INTERACTION_MASK_DIAG_J8_1  0x0080c0e0U
 
+/* The number of clusters in a super-cluster, used for GPU */
+#define c_nbnxnGpuNumClusterPerSupercluster  8
+
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
+ * of integers containing 32 bits.
+ */
+#define c_nbnxnGpuJgroupSize (32/c_nbnxnGpuNumClusterPerSupercluster)
 
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
new file mode 100644
index 0000000000..4a542686ed
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
@@ -0,0 +1,47 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef NBNXN_OPENCL_CONSTS_H
+#define NBNXN_OPENCL_CONSTS_H
+
+/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency.
+ *
+ *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
+ */
+/*! @{ */
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT       4
+//The following has to match getOclPruneKernelJ4Concurrency
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT
+/*! @} */
+#endif
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
index 9c5eaebea3..155cd32589 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
@@ -198,19 +198,10 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
          */
 
         extraDefines += gmx::formatString(
-                    " -DCENTRAL=%d "
-                    "-DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d "
-                    "-DGMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY=%d "
-                    "-DNBNXN_MIN_RSQ=%s %s",
-                    CENTRAL,                                                /* Defined in ishift.h */
-                    c_nbnxnGpuNumClusterPerSupercluster,                    /* Defined in nbnxn_pairlist.h */
+                    " -DNBNXN_GPU_CLUSTER_SIZE=%d "
+                    "%s",
                     c_nbnxnGpuClusterSize,                                  /* Defined in nbnxn_pairlist.h */
-                    c_nbnxnGpuJgroupSize,                                   /* Defined in nbnxn_pairlist.h */
-                    getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e), /* In nbnxn_ocl_types.h  */
-                    STRINGIFY_MACRO(NBNXN_MIN_RSQ)                          /* Defined in nbnxn_consts.h */
-                                                                            /* NBNXN_MIN_RSQ passed as string to avoid
-                                                                                floating point representation problems with sprintf */
-                    , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
+                    (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
                     );
 
         try
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
index 5ce66fa010..aba1c026bf 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -183,7 +183,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
     /* shmem buffer for cj, for both warps separately */
     cjs = (__local int *)(LOCAL_OFFSET);
     #undef LOCAL_OFFSET
-    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+    #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize
 #endif //USE_CJ_PREFETCH
 
 #ifdef IATYPE_SHMEM
@@ -327,7 +327,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_
 #pragma unroll 4
 #endif
-            for (int jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
             {
                 if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
                 {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
index 2c1e9c196f..ce4b885b2f 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -103,7 +103,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
     // TODO move these consts to utils and unify their use with the nonbonded kernels
     const int c_numClPerSupercl    = NCL_PER_SUPERCL;
     const int c_clSize             = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
 
     // TODO pass this value at compile-time as a macro
     const int c_nbnxnGpuClusterpairSplit = 2;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
index 4d5e7c9ba2..be24e7906e 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -33,11 +33,15 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-#include "device_utils.clh"
-#include "vectype_ops.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "nbnxn_ocl_consts.h"
 
 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
-#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
 
 #define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
 
@@ -189,13 +193,12 @@ void preloadCj4Generic(__local int        *sm_cjPreload,
 {
     /* Pre-load cj into shared memory */
 #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
-    if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE)
+    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
     {
         sm_cjPreload[tidxi] = gm_cj[tidxi];
     }
 #else
     const int c_clSize                   = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
     const int c_nbnxnGpuClusterpairSplit = 2;
     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 
@@ -258,7 +261,6 @@ int loadCjPreload(__local int*        sm_cjPreload,
     int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 #else
     const int c_clSize                   = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
     const int c_nbnxnGpuClusterpairSplit = 2;
     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
index 1808def1a0..b56afd6455 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -51,7 +51,9 @@
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/mdlib/nbnxn_gpu_types_common.h"
 #include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h"
 #include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/real.h"
 
 /* kernel does #include "gromacs/math/utilities.h" */
@@ -60,29 +62,13 @@
 //! Define 1/sqrt(pi)
 #define M_FLOAT_1_SQRTPI 0.564189583547756f
 
-/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency.
- *
- *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
- */
-/*! @{ */
-#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD       4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA    4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT   4
-#else
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD       GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA    GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT   GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#endif
 /*! @} */
 /*! \brief Constants for platform-dependent defaults for the prune kernel's j4 processing concurrency.
  *
  *  Initialized using macros that can be overridden at compile-time (using #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY).
  */
 /*! @{ */
-const int c_oclPruneKernelJ4ConcurrencyAMD     = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD;
-const int c_oclPruneKernelJ4ConcurrencyNVIDIA  = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA;
-const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
+const int c_oclPruneKernelJ4ConcurrencyDEFAULT     = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
 /*! @} */
 
 /*! \brief Returns the j4 processing concurrency parameter for the vendor \p vendorId
@@ -90,12 +76,9 @@ const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCU
  */
 static inline int getOclPruneKernelJ4Concurrency(int vendorId)
 {
-    assert(vendorId < OCL_VENDOR_UNKNOWN);
     switch (vendorId)
     {
-        case OCL_VENDOR_AMD:    return c_oclPruneKernelJ4ConcurrencyAMD;     break;
-        case OCL_VENDOR_NVIDIA: return c_oclPruneKernelJ4ConcurrencyNVIDIA;  break;
-        default:                return c_oclPruneKernelJ4ConcurrencyDefault; break;
+        default: return c_oclPruneKernelJ4ConcurrencyDEFAULT;
     }
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h
index df36cade83..facd39e1d2 100644
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -41,6 +41,7 @@
 #include <cstddef>
 
 #include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
 #include "gromacs/mdtypes/nblist.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/bitmask.h"
@@ -83,13 +84,6 @@ static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_NB_CLUSTER_SIZE;
 static constexpr int c_nbnxnGpuClusterSize = 8;
 #endif
 
-/* The number of clusters in a super-cluster, used for GPU */
-static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
-
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
- * of integers containing 32 bits.
- */
-static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
 
 /* In CUDA the number of threads in a warp is 32 and we have cluster pairs
  * of 8*8=64 atoms, so it's convenient to store data for cluster pair halves.
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp
index e8f048157e..db292857fd 100644
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -1101,7 +1101,7 @@ static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *n
         {
             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                     b, c[b],
-                    100.0*c[b]/static_cast<double>(nbl->ncj4*c_nbnxnGpuJgroupSize));
+                    100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
         }
     }
 }
-- 
2.22.0