From 5eedb4d2f25b23c05a4faadf86f126ad7ea3a419 Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Tue, 2 Oct 2018 15:06:52 -0700 Subject: [PATCH] Remove majority of OCL command line constants Related #2661 Change-Id: I8a139501bdd4f479183e829ba4c861f36c73afea --- src/gromacs/gpu_utils/ocl_compiler.cpp | 2 +- src/gromacs/mdlib/nbnxn_consts.h | 8 +++- .../mdlib/nbnxn_ocl/nbnxn_ocl_consts.h | 47 +++++++++++++++++++ .../mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp | 15 ++---- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh | 4 +- .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh | 1 - .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh | 14 +++--- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h | 25 ++-------- src/gromacs/mdlib/nbnxn_pairlist.h | 8 +--- src/gromacs/mdlib/nbnxn_search.cpp | 2 +- 10 files changed, 74 insertions(+), 52 deletions(-) create mode 100644 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp index fa8c4137e6..e3e8b1e8c5 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.cpp +++ b/src/gromacs/gpu_utils/ocl_compiler.cpp @@ -420,7 +420,7 @@ compileProgram(FILE *fplog, { cl_int cl_error; std::string kernelRootPath = getSourceRootPath(kernelRelativePath); - std::string includeRootPath = getSourceRootPath("src/gromacs/gpu_utils"); + std::string includeRootPath = getSourceRootPath("src"); GMX_RELEASE_ASSERT(fplog != nullptr, "Need a valid log file for building OpenCL programs"); diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h index eb693466fc..24ffbb94ad 100644 --- a/src/gromacs/mdlib/nbnxn_consts.h +++ b/src/gromacs/mdlib/nbnxn_consts.h @@ -36,7 +36,6 @@ #ifndef _nbnxn_consts_h #define _nbnxn_consts_h - /* With CPU kernels the i-cluster size is always 4 atoms. * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4. */ @@ -73,5 +72,12 @@ #define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU #define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U +/* The number of clusters in a super-cluster, used for GPU */ +#define c_nbnxnGpuNumClusterPerSupercluster 8 + +/* With GPU kernels we group cluster pairs in 4 to optimize memory usage + * of integers containing 32 bits. + */ +#define c_nbnxnGpuJgroupSize (32/c_nbnxnGpuNumClusterPerSupercluster) #endif diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h new file mode 100644 index 0000000000..4a542686ed --- /dev/null +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h @@ -0,0 +1,47 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef NBNXN_OPENCL_CONSTS_H +#define NBNXN_OPENCL_CONSTS_H + +/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency. + * + * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. + */ +/*! @{ */ +#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT 4 +//The following has to match getOclPruneKernelJ4Concurrency +#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT +/*! @} */ +#endif diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp index 9c5eaebea3..155cd32589 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp @@ -198,19 +198,10 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb) */ extraDefines += gmx::formatString( - " -DCENTRAL=%d " - "-DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d " - "-DGMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY=%d " - "-DNBNXN_MIN_RSQ=%s %s", - CENTRAL, /* Defined in ishift.h */ - c_nbnxnGpuNumClusterPerSupercluster, /* Defined in nbnxn_pairlist.h */ + " -DNBNXN_GPU_CLUSTER_SIZE=%d " + "%s", c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */ - c_nbnxnGpuJgroupSize, /* Defined in nbnxn_pairlist.h */ - getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e), /* In nbnxn_ocl_types.h */ - STRINGIFY_MACRO(NBNXN_MIN_RSQ) /* Defined in nbnxn_consts.h */ - /* NBNXN_MIN_RSQ passed as string to avoid - floating point representation problems with sprintf */ - , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : "" + (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : "" ); try diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index 5ce66fa010..aba1c026bf 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -183,7 +183,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* shmem buffer for cj, for both warps separately */ cjs = (__local int *)(LOCAL_OFFSET); #undef LOCAL_OFFSET - #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE + #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize #endif //USE_CJ_PREFETCH #ifdef IATYPE_SHMEM @@ -327,7 +327,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_ #pragma unroll 4 #endif - for (int jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++) + for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) { if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL))) { diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 2c1e9c196f..ce4b885b2f 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -103,7 +103,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl // TODO move these consts to utils and unify their use with the nonbonded kernels const int c_numClPerSupercl = NCL_PER_SUPERCL; const int c_clSize = CL_SIZE; - const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; // TODO pass this value at compile-time as a macro const int c_nbnxnGpuClusterpairSplit = 2; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index 4d5e7c9ba2..be24e7906e 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -33,11 +33,15 @@ * the research papers on the package. Check out http://www.gromacs.org. */ -#include "device_utils.clh" -#include "vectype_ops.clh" +#include "gromacs/gpu_utils/vectype_ops.clh" +#include "gromacs/gpu_utils/device_utils.clh" +#include "gromacs/mdlib/nbnxn_consts.h" +#include "gromacs/pbcutil/ishift.h" + +#include "nbnxn_ocl_consts.h" #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) -#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) +#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster #define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported @@ -189,13 +193,12 @@ void preloadCj4Generic(__local int *sm_cjPreload, { /* Pre-load cj into shared memory */ #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly - if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE) + if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize) { sm_cjPreload[tidxi] = gm_cj[tidxi]; } #else const int c_clSize = CL_SIZE; - const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; const int c_nbnxnGpuClusterpairSplit = 2; const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; @@ -258,7 +261,6 @@ int loadCjPreload(__local int* sm_cjPreload, int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly #else const int c_clSize = CL_SIZE; - const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; const int c_nbnxnGpuClusterpairSplit = 2; const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 1808def1a0..b56afd6455 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -51,7 +51,9 @@ #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/mdlib/nbnxn_gpu_types_common.h" #include "gromacs/mdlib/nbnxn_pairlist.h" +#include "gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h" #include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/utility/fatalerror.h" #include "gromacs/utility/real.h" /* kernel does #include "gromacs/math/utilities.h" */ @@ -60,29 +62,13 @@ //! Define 1/sqrt(pi) #define M_FLOAT_1_SQRTPI 0.564189583547756f -/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency. - * - * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. - */ -/*! @{ */ -#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD 4 -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA 4 -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT 4 -#else -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY -#endif /*! @} */ /*! \brief Constants for platform-dependent defaults for the prune kernel's j4 processing concurrency. * * Initialized using macros that can be overridden at compile-time (using #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY). */ /*! @{ */ -const int c_oclPruneKernelJ4ConcurrencyAMD = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD; -const int c_oclPruneKernelJ4ConcurrencyNVIDIA = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA; -const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT; +const int c_oclPruneKernelJ4ConcurrencyDEFAULT = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT; /*! @} */ /*! \brief Returns the j4 processing concurrency parameter for the vendor \p vendorId @@ -90,12 +76,9 @@ const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCU */ static inline int getOclPruneKernelJ4Concurrency(int vendorId) { - assert(vendorId < OCL_VENDOR_UNKNOWN); switch (vendorId) { - case OCL_VENDOR_AMD: return c_oclPruneKernelJ4ConcurrencyAMD; break; - case OCL_VENDOR_NVIDIA: return c_oclPruneKernelJ4ConcurrencyNVIDIA; break; - default: return c_oclPruneKernelJ4ConcurrencyDefault; break; + default: return c_oclPruneKernelJ4ConcurrencyDEFAULT; } } diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h index df36cade83..facd39e1d2 100644 --- a/src/gromacs/mdlib/nbnxn_pairlist.h +++ b/src/gromacs/mdlib/nbnxn_pairlist.h @@ -41,6 +41,7 @@ #include #include "gromacs/math/vectypes.h" +#include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdtypes/nblist.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/bitmask.h" @@ -83,13 +84,6 @@ static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_NB_CLUSTER_SIZE; static constexpr int c_nbnxnGpuClusterSize = 8; #endif -/* The number of clusters in a super-cluster, used for GPU */ -static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8; - -/* With GPU kernels we group cluster pairs in 4 to optimize memory usage - * of integers containing 32 bits. - */ -static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster; /* In CUDA the number of threads in a warp is 32 and we have cluster pairs * of 8*8=64 atoms, so it's convenient to store data for cluster pair halves. diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp index e8f048157e..db292857fd 100644 --- a/src/gromacs/mdlib/nbnxn_search.cpp +++ b/src/gromacs/mdlib/nbnxn_search.cpp @@ -1101,7 +1101,7 @@ static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *n { fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n", b, c[b], - 100.0*c[b]/static_cast(nbl->ncj4*c_nbnxnGpuJgroupSize)); + 100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize}); } } } -- 2.22.0