Remove majority of OCL command line constants

author Roland Schulz <roland.schulz@intel.com>

Tue, 2 Oct 2018 22:06:52 +0000 (15:06 -0700)

committer Szilárd Páll <pall.szilard@gmail.com>

Mon, 8 Oct 2018 16:05:44 +0000 (18:05 +0200)
author Roland Schulz <roland.schulz@intel.com>
Tue, 2 Oct 2018 22:06:52 +0000 (15:06 -0700)
committer Szilárd Páll <pall.szilard@gmail.com>
Mon, 8 Oct 2018 16:05:44 +0000 (18:05 +0200)
diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp

index fa8c4137e610d95c00f7809d171d449e17e578b3..e3e8b1e8c5424beb273566927bc3132d0f1be56a 100644 (file)
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -420,7 +420,7 @@ compileProgram(FILE              *fplog,
  {
      cl_int      cl_error;
      std::string kernelRootPath  = getSourceRootPath(kernelRelativePath);
-    std::string includeRootPath = getSourceRootPath("src/gromacs/gpu_utils");
+    std::string includeRootPath = getSourceRootPath("src");
  
      GMX_RELEASE_ASSERT(fplog != nullptr, "Need a valid log file for building OpenCL programs");
  
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h

index eb693466fcd94ed8b2d42f1d14a2e4b3da05ae8a..24ffbb94ad890716edd43353a0654020b25e6bfa 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -36,7 +36,6 @@
  #ifndef _nbnxn_consts_h
  #define _nbnxn_consts_h
  
-
  /* With CPU kernels the i-cluster size is always 4 atoms.
   * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4.
   */
@@ -73,5 +72,12 @@
  #define NBNXN_INTERACTION_MASK_DIAG_J8_0  0xf0f8fcfeU
  #define NBNXN_INTERACTION_MASK_DIAG_J8_1  0x0080c0e0U
  
+/* The number of clusters in a super-cluster, used for GPU */
+#define c_nbnxnGpuNumClusterPerSupercluster  8
+
+/* With GPU kernels we group cluster pairs in sets of 4 to optimize memory usage:
+ * 4 groups of c_nbnxnGpuNumClusterPerSupercluster (8) bits exactly fill a 32-bit integer.
+ */
+#define c_nbnxnGpuJgroupSize (32/c_nbnxnGpuNumClusterPerSupercluster)
  
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h

new file mode 100644 (file)

index 0000000..4a54268
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
@@ -0,0 +1,47 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef NBNXN_OPENCL_CONSTS_H
+#define NBNXN_OPENCL_CONSTS_H
+
+/*! \brief Macros defining the default for the prune kernel's j4 processing concurrency.
+ *
+ *  GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY is defined unconditionally below, so it always equals the default.
+ */
+/*! @{ */
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT       4
+// The following has to match the value returned by getOclPruneKernelJ4Concurrency()
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT
+/*! @} */
+#endif
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp

index 9c5eaebea3be366be8bf2197aaede93a8f31251c..155cd325895f5ce0ae5d0bc73f6c03982f0c0a24 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
@@ -198,19 +198,10 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
           */
  
          extraDefines += gmx::formatString(
-                    " -DCENTRAL=%d "
-                    "-DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d "
-                    "-DGMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY=%d "
-                    "-DNBNXN_MIN_RSQ=%s %s",
-                    CENTRAL,                                                /* Defined in ishift.h */
-                    c_nbnxnGpuNumClusterPerSupercluster,                    /* Defined in nbnxn_pairlist.h */
+                    " -DNBNXN_GPU_CLUSTER_SIZE=%d "
+                    "%s",
                      c_nbnxnGpuClusterSize,                                  /* Defined in nbnxn_pairlist.h */
-                    c_nbnxnGpuJgroupSize,                                   /* Defined in nbnxn_pairlist.h */
-                    getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e), /* In nbnxn_ocl_types.h  */
-                    STRINGIFY_MACRO(NBNXN_MIN_RSQ)                          /* Defined in nbnxn_consts.h */
-                                                                            /* NBNXN_MIN_RSQ passed as string to avoid
-                                                                                floating point representation problems with sprintf */
-                    , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
+                    (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
                      );
  
          try
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh

index 5ce66fa01069c7e184e53d5302cdb7b30aa81ea8..aba1c026bf278c13aa1e513e43824245e42a1181 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -183,7 +183,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
      /* shmem buffer for cj, for both warps separately */
      cjs = (__local int *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
-    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+    #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize
  #endif //USE_CJ_PREFETCH
  
  #ifdef IATYPE_SHMEM
@@ -327,7 +327,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_
  #pragma unroll 4
  #endif
-            for (int jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
              {
                  if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
                  {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh

index 2c1e9c196f21c45f6440455e10ce6b2e7abe3762..ce4b885b2f79a186afc69af8f04c599809152672 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -103,7 +103,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
      // TODO move these consts to utils and unify their use with the nonbonded kernels
      const int c_numClPerSupercl    = NCL_PER_SUPERCL;
      const int c_clSize             = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
  
      // TODO pass this value at compile-time as a macro
      const int c_nbnxnGpuClusterpairSplit = 2;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

index 4d5e7c9ba2d2c400df1e9084e5cf247445deab1e..be24e7906ec7df56939bfc74e07ed7ae9e59c42b 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -33,11 +33,15 @@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
-#include "device_utils.clh"
-#include "vectype_ops.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "nbnxn_ocl_consts.h"
  
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
-#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
  
  #define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
  
@@ -189,13 +193,12 @@ void preloadCj4Generic(__local int        *sm_cjPreload,
  {
      /* Pre-load cj into shared memory */
  #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
-    if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE)
+    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
      {
          sm_cjPreload[tidxi] = gm_cj[tidxi];
      }
  #else
      const int c_clSize                   = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
      const int c_nbnxnGpuClusterpairSplit = 2;
      const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
  
@@ -258,7 +261,6 @@ int loadCjPreload(__local int*        sm_cjPreload,
      int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
  #else
      const int c_clSize                   = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
      const int c_nbnxnGpuClusterpairSplit = 2;
      const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
  
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index 1808def1a01728da70a0b39450c6db1b5c2006fa..b56afd64554fcc5dfe8507e8e1e52ddd04ac6010 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -51,7 +51,9 @@
  #include "gromacs/gpu_utils/oclutils.h"
  #include "gromacs/mdlib/nbnxn_gpu_types_common.h"
  #include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h"
  #include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/real.h"
  
  /* kernel does #include "gromacs/math/utilities.h" */
@@ -60,29 +62,13 @@
  //! Define 1/sqrt(pi)
  #define M_FLOAT_1_SQRTPI 0.564189583547756f
  
-/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency.
- *
- *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
- */
-/*! @{ */
-#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD       4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA    4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT   4
-#else
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD       GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA    GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT   GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#endif
  /*! @} */
  /*! \brief Constants for platform-dependent defaults for the prune kernel's j4 processing concurrency.
   *
   *  Initialized using macros that can be overridden at compile-time (using #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY).
   */
  /*! @{ */
-const int c_oclPruneKernelJ4ConcurrencyAMD     = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD;
-const int c_oclPruneKernelJ4ConcurrencyNVIDIA  = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA;
-const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
+const int c_oclPruneKernelJ4ConcurrencyDEFAULT     = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
  /*! @} */
  
  /*! \brief Returns the j4 processing concurrency parameter for the vendor \p vendorId
@@ -90,12 +76,9 @@ const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCU
   */
  static inline int getOclPruneKernelJ4Concurrency(int vendorId)
  {
-    assert(vendorId < OCL_VENDOR_UNKNOWN);
      switch (vendorId)
      {
-        case OCL_VENDOR_AMD:    return c_oclPruneKernelJ4ConcurrencyAMD;     break;
-        case OCL_VENDOR_NVIDIA: return c_oclPruneKernelJ4ConcurrencyNVIDIA;  break;
-        default:                return c_oclPruneKernelJ4ConcurrencyDefault; break;
+        default: return c_oclPruneKernelJ4ConcurrencyDEFAULT;
      }
  }
  
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h

index df36cade831b2a0e4fedd77d7303be1bf3acbc3e..facd39e1d2a0c32355f3d2c63fd87345d5be4364 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -41,6 +41,7 @@
  #include <cstddef>
  
  #include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
  #include "gromacs/mdtypes/nblist.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/bitmask.h"
@@ -83,13 +84,6 @@ static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_NB_CLUSTER_SIZE;
  static constexpr int c_nbnxnGpuClusterSize = 8;
  #endif
  
-/* The number of clusters in a super-cluster, used for GPU */
-static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
-
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
- * of integers containing 32 bits.
- */
-static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
  
  /* In CUDA the number of threads in a warp is 32 and we have cluster pairs
   * of 8*8=64 atoms, so it's convenient to store data for cluster pair halves.
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp

index e8f048157e259cf4a22772fd5e77a74132039594..db292857fd39895e4bd60716d665d0db9ca82cb3 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -1101,7 +1101,7 @@ static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *n
          {
              fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                      b, c[b],
-                    100.0*c[b]/static_cast<double>(nbl->ncj4*c_nbnxnGpuJgroupSize));
+                    100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
          }
      }
  }
author	Roland Schulz <roland.schulz@intel.com>
	Tue, 2 Oct 2018 22:06:52 +0000 (15:06 -0700)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 8 Oct 2018 16:05:44 +0000 (18:05 +0200)
src/gromacs/gpu_utils/ocl_compiler.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_consts.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_pairlist.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.cpp		patch \| blob \| history