From 12e241d11a611d502a78361f6c310d50bb9a0c7d Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 18 Feb 2021 19:31:47 +0300 Subject: [PATCH] Unify and clarify a couple of macro definitions in NBNXM 1. The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY definition was essentially duplicated in CUDA and OpenCL. This unifies them. 2. The 1/sqrt(PI) macro was defined several times in OpenCL. --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 2 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 13 ------------ src/gromacs/nbnxm/gpu_types_common.h | 10 ++++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 4 +--- src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h | 22 +++++++++------------ src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 16 --------------- 6 files changed, 21 insertions(+), 46 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index aa5652011b..6796da5aac 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -735,7 +735,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c * and j-cluster concurrency, in x, y, and z, respectively. * - The 1D block-grid contains as many blocks as super-clusters. */ - int num_threads_z = c_cudaPruneKernelJ4Concurrency; + int num_threads_z = c_pruneKernelJ4Concurrency; int nblock = calc_nb_kernel_nblock(numSciInPart, &nb->deviceContext_->deviceInfo()); KernelLaunchConfig config; config.blockSize[0] = c_clSize; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 1cd1606de3..65a247ad08 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -59,19 +59,6 @@ #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/enumerationhelpers.h" -/*! \brief Macro definining default for the prune kernel's j4 processing concurrency. - * - * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. 
- */ -#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY -# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4 -#endif -/*! \brief Default for the prune kernel's j4 processing concurrency. - * - * Initialized using the #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro which allows compile-time override. - */ -const int c_cudaPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY; - /* TODO: consider moving this to kernel_utils */ /* Convenience defines */ /*! \brief cluster size = number of atoms per cluster. */ diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h index 26cd3c165f..ebd5db9b34 100644 --- a/src/gromacs/nbnxm/gpu_types_common.h +++ b/src/gromacs/nbnxm/gpu_types_common.h @@ -63,6 +63,16 @@ # include "gromacs/gpu_utils/gpuregiontimer_sycl.h" #endif +/*! \brief Macro defining default for the prune kernel's j4 processing concurrency. + * + * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override with the default value of 4. + */ +#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY +# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4 +#endif +//! Default for the prune kernel's j4 processing concurrency. +static constexpr int c_pruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY; + /** \internal * \brief Parameters required for the GPU nonbonded calculations. */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 16cd265d84..7e4eeca484 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -860,9 +860,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c * and j-cluster concurrency, in x, y, and z, respectively. * - The 1D block-grid contains as many blocks as super-clusters. 
*/ - int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT; - - + int num_threads_z = c_pruneKernelJ4Concurrency; /* kernel launch config */ KernelLaunchConfig config; config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h index 87872c6552..bef4f21f16 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -40,19 +40,15 @@ * \author Berk Hess * \ingroup module_nbnxm */ -#ifndef NBNXN_OPENCL_CONSTS_H -#define NBNXN_OPENCL_CONSTS_H +#ifndef NBNXN_OCL_CONSTS_H +#define NBNXN_OCL_CONSTS_H -/*! \internal \file - * \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency. - * - * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. +/*! \brief Macro defining default for the prune kernel's j4 processing concurrency. * - * \ingroup module_nbnxm + * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override with the default value of 4. */ -/*! @{ */ -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT 4 -// The following has to match getOclPruneKernelJ4Concurrency -#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT -/*! 
@} */ +#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY +# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4 +#endif + #endif diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 2e7098227d..013345b32c 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -60,24 +60,8 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/real.h" -#include "nbnxm_ocl_consts.h" - struct gmx_wallclock_gpu_nbnxn_t; -/* kernel does #include "gromacs/math/utilities.h" */ -/* Move the actual useful stuff here: */ - -//! Define 1/sqrt(pi) -#define M_FLOAT_1_SQRTPI 0.564189583547756f - -/*! \brief Constants for platform-dependent defaults for the prune kernel's j4 processing concurrency. - * - * Initialized using macros that can be overridden at compile-time (using #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY). - */ -/*! @{ */ -const int c_oclPruneKernelJ4ConcurrencyDEFAULT = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT; -/*! @} */ - /*! \brief Pruning kernel flavors. * * The values correspond to the first call of the pruning post-list generation -- 2.22.0