From 4d6252adf2db70503443924a7669478a19ccb3c6 Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Thu, 27 Sep 2018 17:20:02 +0200
Subject: [PATCH] Remove NVML support

Net value for this feature isn't high enough to maintain it.

gmx_device_info_t can now be used const-correct in many more places.

Fixes #2655

Change-Id: I2fac7d8b5613bee3fe0a6020862fc57224f7f6c4
---
 admin/builds/post-submit-matrix.txt       |   1 -
 cmake/FindNVML.cmake                      | 147 ---------
 cmake/gmxManageGPU.cmake                  |  20 --
 docs/install-guide/index.rst              |  19 --
 docs/release-notes/removed-features.rst   |   8 +
 docs/user-guide/mdrun-performance.rst     |  21 --
 src/config.h.cmakein                      |   3 -
 src/gromacs/ewald/pme-gpu-internal.cpp    |  12 +-
 src/gromacs/ewald/pme-gpu-internal.h      |   4 +-
 src/gromacs/ewald/pme-gpu-types-host.h    |   2 +-
 src/gromacs/ewald/pme.cpp                 |  30 +-
 src/gromacs/ewald/pme.h                   |   2 +-
 src/gromacs/ewald/tests/pmetestcommon.cpp |   6 +-
 src/gromacs/ewald/tests/pmetestcommon.h   |   4 +-
 .../ewald/tests/testhardwarecontexts.cpp  |   5 +-
 .../ewald/tests/testhardwarecontexts.h    |  10 +-
 src/gromacs/gpu_utils/cudautils.cuh       |  18 --
 src/gromacs/gpu_utils/gpu_utils.cu        | 288 +-----------------
 src/gromacs/gpu_utils/gpu_utils.h         |   7 +-
 src/gromacs/gpu_utils/gpu_utils_ocl.cpp   |   3 +-
 src/gromacs/mdrun/runner.cpp              |   4 +-
 21 files changed, 51 insertions(+), 563 deletions(-)
 delete mode 100644 cmake/FindNVML.cmake

diff --git a/admin/builds/post-submit-matrix.txt b/admin/builds/post-submit-matrix.txt
index f72fdf66eb..78152c6298 100644
--- a/admin/builds/post-submit-matrix.txt
+++ b/admin/builds/post-submit-matrix.txt
@@ -72,7 +72,6 @@ icc-18 msvc-2017 fftpack simd=avx2_256 release
 # TODO
 # Add SIMD + OpenMP + CUDA asan build
 # Add OpenMP + CUDA + device sharing TSAN build
-# Test with NVML support
 # Test statically linked hwloc support (if/when it can work well)
 # Test 3D DD (2D is partially covered in regressiontests)
 # Test own-fftw build (from local copy of the file)
diff --git a/cmake/FindNVML.cmake b/cmake/FindNVML.cmake
deleted file mode 100644
index 439168d38a..0000000000
--- a/cmake/FindNVML.cmake
+++ /dev/null
@@ -1,147 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
-# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
-# and including many others, as listed in the AUTHORS file in the
-# top-level source directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-
-#.rst:
-# FindNVML
-# --------
-#
-# Find the NVIDIA Management Library (NVML) includes and library. NVML documentation
-# is available at: http://docs.nvidia.com/deploy/nvml-api/index.html
-#
-# Starting with CUDA 8 NVML is part of the CUDA Toolkit. Prior to CUDA 8 NVML was part
-# of the GPU Deployment Kit (GDK) and GPU_DEPLOYMENT_KIT_ROOT_DIR can be specified
-# if the GPU Deployment Kit is not installed in a default location.
-#
-# FindNVML defines the following variables:
-#
-#   NVML_INCLUDE_DIR, where to find nvml.h, etc.
-#   NVML_LIBRARY, the libraries needed to use NVML.
-#   NVML_FOUND, If false, do not try to use NVML.
-#

-# Jiri Kraus, NVIDIA Corp (nvidia.com - jkraus)
-#
-# Copyright (c) 2008 - 2014,2017 NVIDIA Corporation. All rights reserved.
-#
-# This code is licensed under the MIT License. See the FindNVML.cmake script
-# for the text of the license.

-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-if( CMAKE_SYSTEM_NAME STREQUAL "Windows" )
-    set(NVML_NAMES nvml)
-else()
-    set(NVML_NAMES nvidia-ml)
-endif()
-
-if (CUDA_FOUND)
-    if( CMAKE_SYSTEM_NAME STREQUAL "Windows" )
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            set( NVML_LIB_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/lib" )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_LIB_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/lib")
-            endif()
-
-            set( NVML_INC_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/include" )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_INC_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/include")
-            endif()
-        else()
-            set( NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" )
-            set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
-        endif()
-    else()
-
-        set( NVML_LIB_PATHS /usr/lib64 )
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            # The Linux installer for the GPU Deployment Kit adds a "usr"
-            # suffix to a custom path if one is used, so a user could
-            # reasonably set GPU_DEPLOYMENT_KIT_ROOT_DIR to the value they
-            # passed to the installer, or the root where they later found the
-            # kit to be installed. Below, we cater for both possibilities.
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_LIB_PATHS
-                     "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/src/gdk/nvml/lib"
-                     "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/src/gdk/nvml/lib"
-                     )
-            endif()
-        else()
-            list(APPEND NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs")
-        endif()
-
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            set( NVML_INC_PATHS /usr/include/nvidia/gdk/ /usr/include )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_INC_PATHS
-                     "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/include/nvidia/gdk"
-                     "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/include/nvidia/gdk"
-                     )
-            endif()
-        else()
-            set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
-        endif()
-    endif()
-endif()
-
-find_library(NVML_LIBRARY NAMES ${NVML_NAMES} PATHS ${NVML_LIB_PATHS} )
-
-find_path(NVML_INCLUDE_DIR nvml.h PATHS ${NVML_INC_PATHS})
-
-# handle the QUIETLY and REQUIRED arguments and set NVML_FOUND to TRUE if
-# all listed variables are TRUE
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(NVML DEFAULT_MSG NVML_LIBRARY NVML_INCLUDE_DIR)
-
-mark_as_advanced(NVML_LIBRARY NVML_INCLUDE_DIR)
diff --git a/cmake/gmxManageGPU.cmake b/cmake/gmxManageGPU.cmake
index 65db27e771..eaec53a18e 100644
--- a/cmake/gmxManageGPU.cmake
+++ b/cmake/gmxManageGPU.cmake
@@ -130,26 +130,6 @@ ${_msg}")
     endif()
 endif()
 
-# Try to find NVML if a GPU accelerated binary should be build.
-if (GMX_GPU)
-    if (DEFINED NVML_LIBRARY)
-        set(NVML_FIND_QUIETLY TRUE)
-    endif()
-    find_package(NVML)
-    # TODO Default to off, since linking is not implemented reliably
-    option(GMX_USE_NVML "Use NVML support for better CUDA performance" OFF)
-    mark_as_advanced(GMX_USE_NVML)
-    if(GMX_USE_NVML)
-        if(NVML_FOUND)
-            include_directories(SYSTEM ${NVML_INCLUDE_DIR})
-            set(HAVE_NVML 1)
-            list(APPEND GMX_EXTRA_LIBRARIES ${NVML_LIBRARY})
-        else()
-            message(FATAL_ERROR "NVML support was required, but was not detected. Please consult the install guide.")
-        endif()
-    endif()
-endif()
-
 # Annoyingly enough, FindCUDA leaves a few variables behind as non-advanced.
 # We need to mark these advanced outside the conditional, otherwise, if the
 # user turns GMX_GPU=OFF after a failed cmake pass, these variables will be
diff --git a/docs/install-guide/index.rst b/docs/install-guide/index.rst
index 9b88cdc904..a0bee47c96 100644
--- a/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@ -627,25 +627,6 @@ If you have the CUDA_ Toolkit installed, you can use ``cmake`` with:
 need to specify manually which of your C++ compilers should be used,
 e.g. with the advanced option ``CUDA_HOST_COMPILER``.
 
-To make it
-possible to get best performance from NVIDIA Tesla and Quadro GPUs,
-you should install the `GPU Deployment Kit
-<https://developer.nvidia.com/gpu-deployment-kit>`_ and configure
-|Gromacs| to use it by setting the CMake variable
-``-DGPU_DEPLOYMENT_KIT_ROOT_DIR=/path/to/your/kit``. The NVML support
-is most useful if
-``nvidia-smi --applications-clocks-permission=UNRESTRICTED`` is run
-(as root). When application clocks permissions are unrestricted, the
-GPU clock speed can be increased automatically, which increases the
-GPU kernel performance roughly proportional to the clock
-increase. When using |Gromacs| on suitable GPUs under restricted
-permissions, clocks cannot be changed, and in that case informative
-log file messages will be produced. Background details can be found at
-this `NVIDIA blog post
-<https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/>`_.
-NVML support is only available if detected, and may be disabled by
-turning off the ``GMX_USE_NVML`` CMake advanced option.
-
 By default, code will be generated for the most common CUDA architectures.
 However, to reduce build time and binary size we do not generate code for
 every single possible architecture, which in rare cases (say, Tegra systems)
diff --git a/docs/release-notes/removed-features.rst b/docs/release-notes/removed-features.rst
index b84fc5de17..fbe66c04ef 100644
--- a/docs/release-notes/removed-features.rst
+++ b/docs/release-notes/removed-features.rst
@@ -1,3 +1,11 @@
 Removed features
 ^^^^^^^^^^^^^^^^
 
+NVML support removed on NVIDIA GPUs
+"""""""""""""""""""""""""""""""""""
+NVML support (for reporting GPU application clocks, or raising them for
+higher throughput) is no longer available. It was only ever supported on
+high-end hardware, and on recent generations of hardware changing the clocks
+is useful only when the user has root permissions. It may become less useful
+as GROMACS evolves, it complicated the GROMACS code, and it was not regularly
+tested or maintained. It might return if some of these conditions change.
diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst
index 5fa2f686e3..27de1daf26 100644
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -740,27 +740,6 @@ TODO In future patch:
 Running :ref:`mdrun <gmx mdrun>` with GPUs
 ------------------------------------------
 
-NVIDIA GPUs from the professional line (Tesla or Quadro) starting with
-the Kepler generation (compute capability 3.5 and later) support changing the
-processor and memory clock frequency with the help of the applications clocks feature.
-With many workloads, using higher clock rates than the default provides significant
-performance improvements.
-For more information see the `NVIDIA blog article`_ on this topic.
-For |Gromacs| the highest application clock rates are optimal on all hardware
-available to date (up to and including Maxwell, compute capability 5.2).
-
-Application clocks can be set using the NVIDIA system managemet tool
-``nvidia-smi``. If the system permissions allow, :ref:`gmx mdrun` has
-built-in support to set application clocks if built with :ref:`NVML support`.
-Note that application clocks are a global setting, hence affect the
-performance of all applications that use the respective GPU(s).
-For this reason, :ref:`gmx mdrun` sets application clocks at initialization
-to the values optimal for |Gromacs| and it restores them before exiting
-to the values found at startup, unless it detects that they were altered
-during its runtime.
-
-.. _NVIDIA blog article: https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/
-
 .. _gmx-gpu-tasks:
 
 Types of GPU tasks
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index cab63d23a4..0f3aecbe57 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -244,9 +244,6 @@
 /* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
 #define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
 
-/* Use NVML */
-#cmakedefine01 HAVE_NVML
-
 /* Define relative path to OpenCL kernels */
 #define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"
 
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index a030b45a30..8219fd8268 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -755,9 +755,9 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
  * \param[in,out] gpuInfo        The GPU information structure.
  * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t               *pme,
-                         gmx_device_info_t       *gpuInfo,
-                         PmeGpuProgramHandle      pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t                *pme,
+                         const gmx_device_info_t  *gpuInfo,
+                         PmeGpuProgramHandle       pmeGpuProgram)
 {
     pme->gpu       = new PmeGpu();
     PmeGpu *pmeGpu = pme->gpu;
@@ -856,9 +856,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t               *pme,
-                    gmx_device_info_t       *gpuInfo,
-                    PmeGpuProgramHandle      pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t                *pme,
+                    const gmx_device_info_t  *gpuInfo,
+                    PmeGpuProgramHandle       pmeGpuProgram)
 {
     if (!pme_gpu_active(pme))
     {
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index e48e18a7f5..2932136efe 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -628,12 +628,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *GPU_FUNC_ARGUM
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
  * \param[in,out] pme            The PME structure.
- * \param[in,out] gpuInfo        The GPU information structure.
+ * \param[in]     gpuInfo        The GPU information structure.
  * \param[in]     pmeGpuProgram  The PME GPU program data
 * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 */
 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
-                                       gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
+                                       const gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
                                        PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM
 
 /*! \libinternal \brief
diff --git a/src/gromacs/ewald/pme-gpu-types-host.h b/src/gromacs/ewald/pme-gpu-types-host.h
index a372d32430..63d2e660d8 100644
--- a/src/gromacs/ewald/pme-gpu-types-host.h
+++ b/src/gromacs/ewald/pme-gpu-types-host.h
@@ -201,7 +201,7 @@ struct PmeGpu
     int nAtomsAlloc;
 
     /*! \brief A pointer to the device used during the execution. */
-    gmx_device_info_t *deviceInfo;
+    const gmx_device_info_t *deviceInfo;
 
     /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
      * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
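
Note: the const-correctness gain named in the commit message follows from the deletions elsewhere in this patch: init_gpu() previously recorded NVML clock bookkeeping inside gmx_device_info_t, so every caller needed a mutable pointer. With that state gone, detection results stay read-only for the whole run. A minimal compilable sketch of the difference, using simplified stand-in declarations (DeviceInfoBefore/DeviceInfoAfter are illustrative names, not the real GROMACS types):

    // Before: initialization wrote NVML bookkeeping into the device info,
    // forcing non-const pointers on every caller in the chain.
    struct DeviceInfoBefore
    {
        int          id;
        unsigned int nvml_orig_app_sm_clock;  // filled in by init_gpu()
    };
    void init_gpu_before(DeviceInfoBefore *deviceInfo);

    // After: the struct only carries immutable detection results, so the
    // pointer can be const from mdrunner() down through the PME code.
    struct DeviceInfoAfter
    {
        int id;  // the real struct also keeps cudaDeviceProp prop and int stat
    };
    void init_gpu_after(const DeviceInfoAfter *deviceInfo);
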
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 46411815da..699bde35ee 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -603,21 +603,21 @@ static int div_round_up(int enumerator, int denominator)
     return (enumerator + denominator - 1)/denominator;
 }
 
-gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
-                        const NumPmeDomains &numPmeDomains,
-                        const t_inputrec    *ir,
-                        int                  homenr,
-                        gmx_bool             bFreeEnergy_q,
-                        gmx_bool             bFreeEnergy_lj,
-                        gmx_bool             bReproducible,
-                        real                 ewaldcoeff_q,
-                        real                 ewaldcoeff_lj,
-                        int                  nthread,
-                        PmeRunMode           runMode,
-                        PmeGpu              *pmeGpu,
-                        gmx_device_info_t   *gpuInfo,
-                        PmeGpuProgramHandle  pmeGpuProgram,
-                        const gmx::MDLogger & /*mdlog*/)
+gmx_pme_t *gmx_pme_init(const t_commrec         *cr,
+                        const NumPmeDomains     &numPmeDomains,
+                        const t_inputrec        *ir,
+                        int                      homenr,
+                        gmx_bool                 bFreeEnergy_q,
+                        gmx_bool                 bFreeEnergy_lj,
+                        gmx_bool                 bReproducible,
+                        real                     ewaldcoeff_q,
+                        real                     ewaldcoeff_lj,
+                        int                      nthread,
+                        PmeRunMode               runMode,
+                        PmeGpu                  *pmeGpu,
+                        const gmx_device_info_t *gpuInfo,
+                        PmeGpuProgramHandle      pmeGpuProgram,
+                        const gmx::MDLogger     & /*mdlog*/)
 {
     int use_threads, sum_use_threads, i;
     ivec ndata;
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index be91f1c5a6..81988e6bc9 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -143,7 +143,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         int                      nthread,
                         PmeRunMode               runMode,
                         PmeGpu                  *pmeGpu,
-                        gmx_device_info_t       *gpuInfo,
+                        const gmx_device_info_t *gpuInfo,
                         PmeGpuProgramHandle      pmeGpuProgram,
                         const gmx::MDLogger     &mdlog);
 
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index 186e2a1cfa..90f5a400eb 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -103,7 +103,7 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 //! PME initialization - internal
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                       CodePath                  mode,
-                                      gmx_device_info_t        *gpuInfo,
+                                      const gmx_device_info_t  *gpuInfo,
                                       PmeGpuProgramHandle       pmeGpuProgram,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
@@ -152,7 +152,7 @@ static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec,
 //! Simple PME initialization based on input, no atom data
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
@@ -166,7 +166,7 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
 //! PME initialization with atom data
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h
index cc516f621c..d3e9696b2c 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -120,14 +120,14 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
                             CodePath                  mode = CodePath::CPU,
-                            gmx_device_info_t        *gpuInfo = nullptr,
+                            const gmx_device_info_t  *gpuInfo = nullptr,
                             PmeGpuProgramHandle       pmeGpuProgram = nullptr,
                             const Matrix3x3          &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real                      ewaldCoeff_q = 0.0f,
                             real                      ewaldCoeff_lj = 0.0f);
 //! PME initialization with atom data
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
index ab7fcb038e..f97f6b6519 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@ -116,12 +116,11 @@ void PmeTestEnvironment::SetUp()
         // PME can only run on the CPU, so don't make any more test contexts.
         return;
     }
-    const MDLogger dummyLogger;
     // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
     for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
     {
-        gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
-        init_gpu(dummyLogger, deviceInfo);
+        const gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
+        init_gpu(deviceInfo);
 
         char stmp[200] = {};
         get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h
index 7918b005e4..3d2f7ce02b 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.h
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.h
@@ -77,7 +77,7 @@ struct TestHardwareContext
     //! Readable description
     std::string              description_;
     //! Device information pointer
-    gmx_device_info_t       *deviceInfo_;
+    const gmx_device_info_t *deviceInfo_;
     //! Persistent compiled GPU kernels for PME.
     PmeGpuProgramStorage     program_;
 
@@ -85,13 +85,13 @@
     //! Retuns the code path for this context.
     CodePath getCodePath() const { return codePath_; }
     //! Returns a human-readable context description line
-    std::string getDescription() const{return description_; }
+    std::string              getDescription() const{return description_; }
     //! Returns the device info pointer
-    gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+    const gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
     //! Returns the persistent PME GPU kernels
-    PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
+    PmeGpuProgramHandle      getPmeGpuProgram() const{return program_.get(); }
     //! Constructs the context
-    TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
+    TestHardwareContext(CodePath codePath, const char *description, const gmx_device_info_t *deviceInfo) :
         codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
         program_(buildPmeGpuProgram(deviceInfo_)) {}
     ~TestHardwareContext();
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index dac92fa6fe..230f968e36 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -35,12 +35,7 @@
 #ifndef GMX_GPU_UTILS_CUDAUTILS_CUH
 #define GMX_GPU_UTILS_CUDAUTILS_CUH
 
-#include "config.h"
-
 #include
-#if HAVE_NVML
-#include <nvml.h>
-#endif /* HAVE_NVML */
 
 #include
 #include
@@ -130,7 +125,6 @@ enum class GpuApiCallBehavior;
 #define CU_RET_ERR(status, msg) do { } while (0)
 #define CU_CHECK_PREV_ERR()     do { } while (0)
-#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
 
 #endif /* CHECK_CUDA_ERRORS */
 
@@ -139,24 +133,12 @@
 /*! \brief Information about GPU
 *
 * The CUDA device information is queried and set at detection and contains
 * both information about the device/hardware returned by the runtime as well
 * as additional data like support status.
- *
- * \todo extract an object to manage NVML details
 */
 struct gmx_device_info_t
 {
     int                 id;                      /* id of the CUDA device */
     cudaDeviceProp      prop;                    /* CUDA device properties */
     int                 stat;                    /* result of the device check */
-    unsigned int        nvml_orig_app_sm_clock;  /* The original SM clock before we changed it */
-    unsigned int        nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
-    gmx_bool            nvml_app_clocks_changed; /* If application clocks have been changed */
-    unsigned int        nvml_set_app_sm_clock;   /* The SM clock we set */
-    unsigned int        nvml_set_app_mem_clock;  /* The memory clock we set */
-#if HAVE_NVML
-    nvmlDevice_t        nvml_device_id;          /* NVML device id */
-    // TODO This can become a bool with a more useful name
-    nvmlEnableState_t   nvml_is_restricted;      /* Status of application clocks permission */
-#endif                                           /* HAVE_NVML */
 };
 
 /*! Launches synchronous or asynchronous device to host memory copy.
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 3c0722c69d..ed94834961 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -42,8 +42,6 @@
 
 #include "gpu_utils.h"
 
-#include "config.h"
-
 #include
 #include
 #include
@@ -58,38 +56,11 @@
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/logger.h"
 #include "gromacs/utility/programcontext.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
 #include "gromacs/utility/stringutil.h"
 
-#if HAVE_NVML
-#include <nvml.h>
-#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
-#else /* HAVE_NVML */
-#define HAVE_NVML_APPLICATION_CLOCKS 0
-#endif /* HAVE_NVML */
-
-#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
-/*! Check for NVML error on the return status of a NVML API call. */
-# define HANDLE_NVML_RET_ERR(status, msg) \
-    do { \
-        if (status != NVML_SUCCESS) \
-        { \
-            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
-        } \
-    } while (0)
-#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-# define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
-#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-
-#if HAVE_NVML_APPLICATION_CLOCKS
-static const gmx_bool            bCompiledWithApplicationClockSupport = true;
-#else
-static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
-#endif
-
 /*! \internal \brief
 * Max number of devices supported by CUDA (for consistency checking).
 *
@@ -273,256 +244,7 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
     return 0;
 }
 
-#if HAVE_NVML_APPLICATION_CLOCKS
-/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
- *
- * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
- * matching PCI-E information from \cuda_dev with the available NVML devices.
- *
- * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
- * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
- */
-static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
-{
-    nvmlDevice_t nvml_device_id;
-    unsigned int nvml_device_count = 0;
-    nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
-    bool         nvmlWasInitialized = false;
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
-    for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
-    {
-        nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
-        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
-        if (nvml_stat != NVML_SUCCESS)
-        {
-            break;
-        }
-
-        nvmlPciInfo_t nvml_pci_info;
-        nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
-        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
-        if (nvml_stat != NVML_SUCCESS)
-        {
-            break;
-        }
-        if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
-            static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
-            static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
-        {
-            nvmlWasInitialized       = true;
-            cuda_dev->nvml_device_id = nvml_device_id;
-            break;
-        }
-    }
-    return nvmlWasInitialized;
-}
-
-/*! \brief Reads and returns the application clocks for device.
- *
- * \param[in]  device        The GPU device
- * \param[out] app_sm_clock  The current application SM clock
- * \param[out] app_mem_clock The current application memory clock
- * \returns if applacation clocks are supported
- */
-static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
-                                 unsigned int            *app_sm_clock,
-                                 unsigned int            *app_mem_clock)
-{
-    nvmlReturn_t nvml_stat;
-
-    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
-    if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
-    {
-        return false;
-    }
-    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
-    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
-    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
-
-    return true;
-}
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-
-/*! \brief Tries to set application clocks for the GPU with the given index.
- *
- * Application clocks are set to the max supported value to increase
- * performance if application clock permissions allow this. For future
- * GPU architectures a more sophisticated scheme might be required.
- *
- * \todo Refactor this into a detection phase and a work phase. Also
- * refactor to remove compile-time dependence on logging header.
- *
- * \param     mdlog     log file to write to
- * \param[in] cuda_dev  GPU device info for the GPU in use
- * \returns             true if no error occurs during application clocks handling.
- */
-static gmx_bool init_gpu_application_clocks(
-        const gmx::MDLogger &mdlog,
-        gmx_device_info_t   *cuda_dev)
-{
-    const cudaDeviceProp *prop                        = &cuda_dev->prop;
-    int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
-    gmx_bool              bGpuCanUseApplicationClocks =
-        ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
-         (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
-    if (!bGpuCanUseApplicationClocks)
-    {
-        return true;
-    }
-#if !HAVE_NVML
-    GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-            "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
-            "      application clocks of the detected %s GPU to improve performance.\n"
-            "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
-            prop->name);
-    return true;
-#else
-    if (!bCompiledWithApplicationClockSupport)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
-                "      managing application clocks of the detected %s GPU to improve performance.\n"
-                "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
-                prop->name );
-        return true;
-    }
-
-    /* We've compiled with NVML application clocks support, and have a GPU that can use it */
-    nvmlReturn_t nvml_stat = NVML_SUCCESS;
-    char        *env;
-    //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
-    //      this variable can be later used to give a user more fine grained control.
-    env = getenv("GMX_GPU_APPLICATION_CLOCKS");
-    if (env != NULL && ( strcmp( env, "0") == 0 ||
-                         gmx_strcasecmp( env, "OFF") == 0 ||
-                         gmx_strcasecmp( env, "DISABLE") == 0 ))
-    {
-        return true;
-    }
-    nvml_stat = nvmlInit();
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
-    if (nvml_stat != NVML_SUCCESS)
-    {
-        return false;
-    }
-
-    if (!addNVMLDeviceId(cuda_dev))
-    {
-        return false;
-    }
-    //get current application clocks setting
-    if (!getApplicationClocks(cuda_dev,
-                              &cuda_dev->nvml_orig_app_sm_clock,
-                              &cuda_dev->nvml_orig_app_mem_clock))
-    {
-        return false;
-    }
-    //get max application clocks
-    unsigned int max_sm_clock  = 0;
-    unsigned int max_mem_clock = 0;
-    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-
-    cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
-    cuda_dev->nvml_app_clocks_changed = false;
-
-    if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
-    {
-        //TODO: This should probably be integrated into the GPU Properties table.
-        GMX_LOG(mdlog.info).appendTextFormatted(
-                "Application clocks (GPU clocks) for %s are (%d,%d)",
-                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
-        return true;
-    }
-
-    if (cuda_compute_capability >= 60)
-    {
-        // Only warn about not being able to change clocks if they are not already at the max values
-        if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
-        {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
-                    cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
-        }
-        return true;
-    }
-
-    nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
-
-    if (nvml_stat != NVML_SUCCESS)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
-                nvml_stat, nvmlErrorString(nvml_stat));
-        return false;
-    }
-
-    if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
-    {
-        // Only warn about not being able to change clocks if they are not already at the max values
-        if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
-        {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
-                    cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
-        }
-        return true;
-    }
-
-    /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
-       e.g. if max application clocks should not be used for certain GPUs. */
-    GMX_LOG(mdlog.warning).appendTextFormatted(
-            "Changing GPU application clocks for %s to (%d,%d)",
-            cuda_dev->prop.name, max_mem_clock, max_sm_clock);
-    nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-    cuda_dev->nvml_app_clocks_changed = true;
-    cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
-    cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
-
-    return true;
-#endif /* HAVE_NVML */
-}
-
-/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
- *
- * \param[in] gpu_dev  CUDA device information
- */
-static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
-{
-#if !HAVE_NVML_APPLICATION_CLOCKS
-    GMX_UNUSED_VALUE(cuda_dev);
-    return true;
-#else /* HAVE_NVML_APPLICATION_CLOCKS */
-    nvmlReturn_t nvml_stat = NVML_SUCCESS;
-    if (cuda_dev &&
-        cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
-        cuda_dev->nvml_app_clocks_changed)
-    {
-        /* Check if the clocks are still what we set them to.
-         * If so, set them back to the state we originally found them in.
-         * If not, don't touch them, because something else set them later.
-         */
-        unsigned int app_sm_clock, app_mem_clock;
-        getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
-        if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
-            app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
-        {
-            nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
-            HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
-        }
-    }
-    nvml_stat = nvmlShutdown();
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
-    return (nvml_stat == NVML_SUCCESS);
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-}
-
-void init_gpu(const gmx::MDLogger &mdlog,
-              gmx_device_info_t   *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
 {
     cudaError_t stat;
 
@@ -541,9 +263,6 @@ void init_gpu(const gmx::MDLogger &mdlog,
     }
 
     checkCompiledTargetCompatibility(deviceInfo);
-
-    //Ignoring return value as NVML errors should be treated not critical.
-    init_gpu_application_clocks(mdlog, deviceInfo);
 }
 
 void free_gpu(const gmx_device_info_t *deviceInfo)
@@ -566,11 +285,6 @@ void free_gpu(const gmx_device_info_t *deviceInfo)
         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
     }
 
-    if (!reset_gpu_application_clocks(deviceInfo))
-    {
-        gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
-    }
-
     stat = cudaDeviceReset();
     if (stat != cudaSuccess)
     {
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 65d8502e39..ace93a5e3a 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -58,7 +58,6 @@ struct gmx_gpu_info_t;
 
 namespace gmx
 {
-class MDLogger;
 }
 
 //! Enum which is only used to describe transfer calls at the moment
@@ -143,15 +142,13 @@ void free_gpu_info(const gmx_gpu_info_t *gpu_info);
 * TODO Doxygen complains about these - probably a Doxygen bug, since
 * the patterns here are the same as elsewhere in this header.
 *
- * param[in]    mdlog        log file to write to
- * \param[inout] deviceInfo  device info of the GPU to initialize
+ * \param[in]    deviceInfo  device info of the GPU to initialize
 *
 * Issues a fatal error for any critical errors that occur during
 * initialization.
 */
 GPU_FUNC_QUALIFIER
-void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-              gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
+void init_gpu(const gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index 68e74775ce..af3018033b 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -381,8 +381,7 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
 }
 
 //! This function is documented in the header file
-void init_gpu(const gmx::MDLogger & /*mdlog*/,
-              gmx_device_info_t   *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
 {
     assert(deviceInfo);
 
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 678ba99535..d2ac9390fd 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1022,7 +1022,7 @@ int Mdrunner::mdrunner()
     {
         int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
         nonbondedDeviceInfo   = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
-        init_gpu(mdlog, nonbondedDeviceInfo);
+        init_gpu(nonbondedDeviceInfo);
 
         if (DOMAINDECOMP(cr))
         {
@@ -1046,7 +1046,7 @@ int Mdrunner::mdrunner()
     if (thisRankHasPmeGpuTask)
     {
         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
-        init_gpu(mdlog, pmeDeviceInfo);
+        init_gpu(pmeDeviceInfo);
         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
         // TODO It would be nice to move this logic into the factory
        // function. See Redmine #2535.
-- 
2.22.0
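
Note: users who relied on mdrun raising the application clocks can still set them by hand before a run, for example with "sudo nvidia-smi -acp UNRESTRICTED" once and then "nvidia-smi -ac <memclock>,<smclock>"; both commands appear in the messages removed above. The same is possible directly against NVML. The sketch below mirrors the calls the deleted init_gpu_application_clocks() made, with error handling trimmed and device index 0 assumed:

    #include <cstdio>
    #include <nvml.h>

    int main()
    {
        nvmlDevice_t device;
        unsigned int maxSmClock = 0, maxMemClock = 0;

        nvmlInit();
        nvmlDeviceGetHandleByIndex(0, &device);
        // Query the maximum supported SM and memory clocks, as the removed
        // code did before deciding whether a change was worthwhile.
        nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &maxSmClock);
        nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_MEM, &maxMemClock);
        // Needs root or UNRESTRICTED application-clocks permission,
        // exactly as the removed GROMACS code did.
        if (nvmlDeviceSetApplicationsClocks(device, maxMemClock, maxSmClock) == NVML_SUCCESS)
        {
            printf("Set application clocks to (%u,%u)\n", maxMemClock, maxSmClock);
        }
        nvmlShutdown();
        return 0;
    }

Building it as "g++ set_clocks.cpp -o set_clocks -lnvidia-ml" matches the library name the deleted FindNVML.cmake searched for on Linux; the file name is arbitrary.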