Remove NVML support
author    Mark Abraham <mark.j.abraham@gmail.com>
Thu, 27 Sep 2018 15:20:02 +0000 (17:20 +0200)
committer Berk Hess <hess@kth.se>
Sat, 29 Sep 2018 21:02:31 +0000 (23:02 +0200)
The net value of this feature isn't high enough to justify maintaining it.

gmx_device_info_t can now be used const-correctly in many more places.
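
For example, GPU initialization no longer needs mutable access to the
device info, since the NVML clock-handling code was its only writer. A
minimal sketch of the signature change (see the gpu_utils.h hunk below):

    // Before: a logger and mutable device info, needed for NVML clock handling
    void init_gpu(const gmx::MDLogger &mdlog, gmx_device_info_t *deviceInfo);

    // After: the device info is only read, so a const pointer suffices
    void init_gpu(const gmx_device_info_t *deviceInfo);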

Fixes #2655

Change-Id: I2fac7d8b5613bee3fe0a6020862fc57224f7f6c4

21 files changed:
admin/builds/post-submit-matrix.txt
cmake/FindNVML.cmake [deleted file]
cmake/gmxManageGPU.cmake
docs/install-guide/index.rst
docs/release-notes/removed-features.rst
docs/user-guide/mdrun-performance.rst
src/config.h.cmakein
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-types-host.h
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/ewald/tests/testhardwarecontexts.h
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/gpu_utils.cu
src/gromacs/gpu_utils/gpu_utils.h
src/gromacs/gpu_utils/gpu_utils_ocl.cpp
src/gromacs/mdrun/runner.cpp

index f72fdf66eb8e686e70f723664a6811c60651400a..78152c629829ce69c4deb0183b278c667189bf3e 100644 (file)
@@ -72,7 +72,6 @@ icc-18 msvc-2017 fftpack simd=avx2_256 release
 # TODO
 # Add SIMD + OpenMP + CUDA asan build
 # Add OpenMP + CUDA + device sharing TSAN build
-# Test with NVML support
 # Test statically linked hwloc support (if/when it can work well)
 # Test 3D DD (2D is partially covered in regressiontests)
 # Test own-fftw build (from local copy of the file)
diff --git a/cmake/FindNVML.cmake b/cmake/FindNVML.cmake
deleted file mode 100644 (file)
index 439168d..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
-# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
-# and including many others, as listed in the AUTHORS file in the
-# top-level source directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-
-#.rst:
-# FindNVML
-# --------
-#
-# Find the NVIDIA Management Library (NVML) includes and library. NVML documentation
-# is available at: http://docs.nvidia.com/deploy/nvml-api/index.html
-#
-# Starting with CUDA 8 NVML is part of the CUDA Toolkit. Prior to CUDA 8 NVML was part
-# of the GPU Deployment Kit (GDK) and GPU_DEPLOYMENT_KIT_ROOT_DIR can be specified
-# if the GPU Deployment Kit is not installed in a default location.
-#
-# FindNVML defines the following variables:
-#
-#   NVML_INCLUDE_DIR, where to find nvml.h, etc.
-#   NVML_LIBRARY, the libraries needed to use NVML.
-#   NVML_FOUND, If false, do not try to use NVML.
-#
-
-#   Jiri Kraus, NVIDIA Corp (nvidia.com - jkraus)
-#
-#   Copyright (c) 2008 - 2014,2017 NVIDIA Corporation.  All rights reserved.
-#
-#   This code is licensed under the MIT License.  See the FindNVML.cmake script
-#   for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-if( CMAKE_SYSTEM_NAME STREQUAL "Windows"  )
-    set(NVML_NAMES nvml)
-else()
-    set(NVML_NAMES nvidia-ml)
-endif()
-
-if (CUDA_FOUND)
-    if( CMAKE_SYSTEM_NAME STREQUAL "Windows"  )
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            set( NVML_LIB_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/lib" )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_LIB_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/lib")
-            endif()
-
-            set( NVML_INC_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/include" )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_INC_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/include")
-            endif()
-        else()
-            set( NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" )
-            set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
-        endif()
-    else()
-
-        set( NVML_LIB_PATHS /usr/lib64 )
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            # The Linux installer for the GPU Deployment Kit adds a "usr"
-            # suffix to a custom path if one is used, so a user could
-            # reasonably set GPU_DEPLOYMENT_KIT_ROOT_DIR to the value they
-            # passed to the installer, or the root where they later found the
-            # kit to be installed. Below, we cater for both possibilities.
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_LIB_PATHS
-                    "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/src/gdk/nvml/lib"
-                    "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/src/gdk/nvml/lib"
-                    )
-            endif()
-        else()
-            list(APPEND NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs")
-        endif()
-
-        if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
-            set( NVML_INC_PATHS /usr/include/nvidia/gdk/ /usr/include )
-            if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
-                list(APPEND NVML_INC_PATHS
-                    "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/include/nvidia/gdk"
-                    "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/include/nvidia/gdk"
-                    )
-            endif()
-        else()
-            set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
-        endif()
-    endif()
-endif()
-
-find_library(NVML_LIBRARY NAMES ${NVML_NAMES} PATHS ${NVML_LIB_PATHS} )
-
-find_path(NVML_INCLUDE_DIR nvml.h PATHS ${NVML_INC_PATHS})
-
-# handle the QUIETLY and REQUIRED arguments and set NVML_FOUND to TRUE if
-# all listed variables are TRUE
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(NVML DEFAULT_MSG NVML_LIBRARY NVML_INCLUDE_DIR)
-
-mark_as_advanced(NVML_LIBRARY NVML_INCLUDE_DIR)
index 65db27e771a16fa3029cf65bf0eea4795b111a59..eaec53a18e17a1b6b12bf4aff0c4d8918a0f571e 100644 (file)
@@ -130,26 +130,6 @@ ${_msg}")
     endif()
 endif()
 
-# Try to find NVML if a GPU accelerated binary should be build.
-if (GMX_GPU)
-    if (DEFINED NVML_LIBRARY)
-        set(NVML_FIND_QUIETLY TRUE)
-    endif()
-    find_package(NVML)
-    # TODO Default to off, since linking is not implemented reliably
-    option(GMX_USE_NVML "Use NVML support for better CUDA performance" OFF)
-    mark_as_advanced(GMX_USE_NVML)
-    if(GMX_USE_NVML)
-        if(NVML_FOUND)
-            include_directories(SYSTEM ${NVML_INCLUDE_DIR})
-            set(HAVE_NVML 1)
-            list(APPEND GMX_EXTRA_LIBRARIES ${NVML_LIBRARY})
-        else()
-            message(FATAL_ERROR "NVML support was required, but was not detected. Please consult the install guide.")
-        endif()
-    endif()
-endif()
-
 # Annoyingly enough, FindCUDA leaves a few variables behind as non-advanced.
 # We need to mark these advanced outside the conditional, otherwise, if the
 # user turns GMX_GPU=OFF after a failed cmake pass, these variables will be
index 9b88cdc904ad544130c56f119223bf91b8a72e8f..a0bee47c968eb6bf10e5f92dc476646d54bd2730 100644 (file)
@@ -627,25 +627,6 @@ If you have the CUDA_ Toolkit installed, you can use ``cmake`` with:
 need to specify manually which of your C++ compilers should be used,
 e.g. with the advanced option ``CUDA_HOST_COMPILER``.
 
-To make it
-possible to get best performance from NVIDIA Tesla and Quadro GPUs,
-you should install the `GPU Deployment Kit
-<https://developer.nvidia.com/gpu-deployment-kit>`_ and configure
-|Gromacs| to use it by setting the CMake variable
-``-DGPU_DEPLOYMENT_KIT_ROOT_DIR=/path/to/your/kit``. The NVML support
-is most useful if
-``nvidia-smi --applications-clocks-permission=UNRESTRICTED`` is run
-(as root). When application clocks permissions are unrestricted, the
-GPU clock speed can be increased automatically, which increases the
-GPU kernel performance roughly proportional to the clock
-increase. When using |Gromacs| on suitable GPUs under restricted
-permissions, clocks cannot be changed, and in that case informative
-log file messages will be produced. Background details can be found at
-this `NVIDIA blog post
-<http://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/>`_.
-NVML support is only available if detected, and may be disabled by
-turning off the ``GMX_USE_NVML`` CMake advanced option.
-
 By default, code will be generated for the most common CUDA architectures.
 However, to reduce build time and binary size we do not generate code for
 every single possible architecture, which in rare cases (say, Tegra systems)
index b84fc5de17d4d30b36b466f19901fc3850ac64bb..fbe66c04ef138d0b0ccb4882cbb6b7c93fc7a2cf 100644 (file)
@@ -1,3 +1,11 @@
 Removed features
 ^^^^^^^^^^^^^^^^
 
+NVML support removed on NVIDIA GPUs
+"""""""""""""""""""""""""""""""""""
+NVML support (for reporting GPU application clocks, or changing them for
+higher throughput) is no longer available. It was only ever supported on
+high-end hardware, and on recent hardware generations changing the clocks is
+only useful when the user has root permissions. The support complicated the
+GROMACS code, was not regularly tested or maintained, and would have become
+less useful as GROMACS evolves. It might return if some of these conditions change.
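+
+Users who relied on mdrun raising the application clocks can still set
+them manually before running, e.g. with the ``nvidia-smi`` tool.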
index 5fa2f686e3f793bce8578152b1199b079a00aeb6..27de1daf269b56d11963feb05d244721a170651c 100644 (file)
@@ -740,27 +740,6 @@ TODO In future patch:
 Running :ref:`mdrun <gmx mdrun>` with GPUs
 ------------------------------------------
 
-NVIDIA GPUs from the professional line (Tesla or Quadro) starting with
-the Kepler generation (compute capability 3.5 and later) support changing the
-processor and memory clock frequency with the help of the applications clocks feature.
-With many workloads, using higher clock rates than the default provides significant
-performance improvements.
-For more information see the `NVIDIA blog article`_ on this topic.
-For |Gromacs| the highest application clock rates are optimal on all hardware
-available to date (up to and including Maxwell, compute capability 5.2).
-
-Application clocks can be set using the NVIDIA system managemet tool
-``nvidia-smi``. If the system permissions allow, :ref:`gmx mdrun` has
-built-in support to set application clocks if built with :ref:`NVML support<CUDA GPU acceleration>`.
-Note that application clocks are a global setting, hence affect the
-performance of all applications that use the respective GPU(s).
-For this reason, :ref:`gmx mdrun` sets application clocks at initialization
-to the values optimal for |Gromacs| and it restores them before exiting
-to the values found at startup, unless it detects that they were altered
-during its runtime.
-
-.. _NVIDIA blog article: https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/
-
 .. _gmx-gpu-tasks:
 
 Types of GPU tasks
index cab63d23a4425305e077eed4040166f990612ea6..0f3aecbe57007d6865d0772f4fa7d999d94b4e75 100644 (file)
 /* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
 #define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
 
-/* Use NVML */
-#cmakedefine01 HAVE_NVML
-
 /* Define relative path to OpenCL kernels */
 #define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"
 
index a030b45a30c7323573f6e0054a582b5c446a9e8c..8219fd82685bf4780c6ecdecab507cea0906d44e 100644 (file)
@@ -755,9 +755,9 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
  * \param[in,out] gpuInfo        The GPU information structure.
  * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t          *pme,
-                         gmx_device_info_t  *gpuInfo,
-                         PmeGpuProgramHandle pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t               *pme,
+                         const gmx_device_info_t *gpuInfo,
+                         PmeGpuProgramHandle      pmeGpuProgram)
 {
     pme->gpu          = new PmeGpu();
     PmeGpu *pmeGpu = pme->gpu;
@@ -856,9 +856,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t          *pme,
-                    gmx_device_info_t  *gpuInfo,
-                    PmeGpuProgramHandle pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t               *pme,
+                    const gmx_device_info_t *gpuInfo,
+                    PmeGpuProgramHandle      pmeGpuProgram)
 {
     if (!pme_gpu_active(pme))
     {
index e48e18a7f543d044c2e5f981b7501b2277025a3c..2932136efe954efb5eb9f4f85f427a7432865c75 100644 (file)
@@ -628,12 +628,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *GPU_FUNC_ARGUM
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
  * \param[in,out] pme             The PME structure.
- * \param[in,out] gpuInfo         The GPU information structure.
+ * \param[in]     gpuInfo         The GPU information structure.
  * \param[in]     pmeGpuProgram   The PME GPU program data
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
-                                       gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
+                                       const gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
                                        PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM
 
 /*! \libinternal \brief
index a372d324309fc5cb684b1c575758a5f2fef81345..63d2e660d84e5e3a9bf26e8e5b8360981455a47b 100644 (file)
@@ -201,7 +201,7 @@ struct PmeGpu
     int nAtomsAlloc;
 
     /*! \brief A pointer to the device used during the execution. */
-    gmx_device_info_t *deviceInfo;
+    const gmx_device_info_t *deviceInfo;
 
     /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
      * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
index 46411815da26cf4d28aafaa21173650aea8cd105..699bde35ee9eba682b9114c416c33cbd7e467404 100644 (file)
@@ -603,21 +603,21 @@ static int div_round_up(int enumerator, int denominator)
     return (enumerator + denominator - 1)/denominator;
 }
 
-gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
-                        const NumPmeDomains &numPmeDomains,
-                        const t_inputrec    *ir,
-                        int                  homenr,
-                        gmx_bool             bFreeEnergy_q,
-                        gmx_bool             bFreeEnergy_lj,
-                        gmx_bool             bReproducible,
-                        real                 ewaldcoeff_q,
-                        real                 ewaldcoeff_lj,
-                        int                  nthread,
-                        PmeRunMode           runMode,
-                        PmeGpu              *pmeGpu,
-                        gmx_device_info_t   *gpuInfo,
-                        PmeGpuProgramHandle  pmeGpuProgram,
-                        const gmx::MDLogger  & /*mdlog*/)
+gmx_pme_t *gmx_pme_init(const t_commrec         *cr,
+                        const NumPmeDomains     &numPmeDomains,
+                        const t_inputrec        *ir,
+                        int                      homenr,
+                        gmx_bool                 bFreeEnergy_q,
+                        gmx_bool                 bFreeEnergy_lj,
+                        gmx_bool                 bReproducible,
+                        real                     ewaldcoeff_q,
+                        real                     ewaldcoeff_lj,
+                        int                      nthread,
+                        PmeRunMode               runMode,
+                        PmeGpu                  *pmeGpu,
+                        const gmx_device_info_t *gpuInfo,
+                        PmeGpuProgramHandle      pmeGpuProgram,
+                        const gmx::MDLogger      & /*mdlog*/)
 {
     int               use_threads, sum_use_threads, i;
     ivec              ndata;
index be91f1c5a623c07e12ba4903924b6f237cd537f3..81988e6bc9ce65beef8971891655cb9bc19991bd 100644 (file)
@@ -143,7 +143,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         int nthread,
                         PmeRunMode runMode,
                         PmeGpu *pmeGpu,
-                        gmx_device_info_t *gpuInfo,
+                        const gmx_device_info_t *gpuInfo,
                         PmeGpuProgramHandle pmeGpuProgram,
                         const gmx::MDLogger &mdlog);
 
index 186e2a1cfaed76bc2e799c5b73fc21d2401b4852..90f5a400eb1a79a5fae23e45c456f875627a13b8 100644 (file)
@@ -103,7 +103,7 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 //! PME initialization - internal
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                       CodePath                  mode,
-                                      gmx_device_info_t        *gpuInfo,
+                                      const gmx_device_info_t  *gpuInfo,
                                       PmeGpuProgramHandle       pmeGpuProgram,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
@@ -152,7 +152,7 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
 //! Simple PME initialization based on input, no atom data
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
@@ -166,7 +166,7 @@ PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
 //! PME initialization with atom data
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
index cc516f621c4c5c5dfb82db84eb7d255a81ff5301..d3e9696b2cbc62b99add86eb0a8757bac225d0fe 100644 (file)
@@ -120,14 +120,14 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
                             CodePath mode = CodePath::CPU,
-                            gmx_device_info_t *gpuInfo = nullptr,
+                            const gmx_device_info_t *gpuInfo = nullptr,
                             PmeGpuProgramHandle pmeGpuProgram = nullptr,
                             const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
 //! PME initialization with atom data and system box
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
-                            gmx_device_info_t        *gpuInfo,
+                            const gmx_device_info_t  *gpuInfo,
                             PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
index ab7fcb038ecfbead3a4e39cccb44e6f423c5e760..f97f6b65195173fe8d44fdc14310c75a2da1c58f 100644 (file)
@@ -116,12 +116,11 @@ void PmeTestEnvironment::SetUp()
         // PME can only run on the CPU, so don't make any more test contexts.
         return;
     }
-    const MDLogger dummyLogger;
     // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
     for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
     {
-        gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
-        init_gpu(dummyLogger, deviceInfo);
+        const gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
+        init_gpu(deviceInfo);
 
         char        stmp[200] = {};
         get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
index 7918b005e49389e49e622e45e062eb8ee3f0783a..3d2f7ce02b2add31748dc4edd1fb97d806d01f52 100644 (file)
@@ -77,7 +77,7 @@ struct TestHardwareContext
     //! Readable description
     std::string               description_;
     //! Device information pointer
-    gmx_device_info_t        *deviceInfo_;
+    const gmx_device_info_t  *deviceInfo_;
     //! Persistent compiled GPU kernels for PME.
     PmeGpuProgramStorage      program_;
 
@@ -85,13 +85,13 @@ struct TestHardwareContext
         //! Retuns the code path for this context.
         CodePath getCodePath() const { return codePath_; }
         //! Returns a human-readable context description line
-        std::string         getDescription() const{return description_; }
+        std::string              getDescription() const{return description_; }
         //! Returns the device info pointer
-        gmx_device_info_t  *getDeviceInfo() const{return deviceInfo_; }
+        const gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
         //! Returns the persistent PME GPU kernels
-        PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
+        PmeGpuProgramHandle      getPmeGpuProgram() const{return program_.get(); }
         //! Constructs the context
-        TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
+        TestHardwareContext(CodePath codePath, const char *description, const gmx_device_info_t *deviceInfo) :
             codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
             program_(buildPmeGpuProgram(deviceInfo_)) {}
         ~TestHardwareContext();
index dac92fa6fe70b45a40e838aef682cd5cf8c268a5..230f968e361feacca35b58910e3d53057ccb85ff 100644 (file)
 #ifndef GMX_GPU_UTILS_CUDAUTILS_CUH
 #define GMX_GPU_UTILS_CUDAUTILS_CUH
 
-#include "config.h"
-
 #include <stdio.h>
-#if HAVE_NVML
-#include <nvml.h>
-#endif /* HAVE_NVML */
 
 #include <array>
 #include <string>
@@ -130,7 +125,6 @@ enum class GpuApiCallBehavior;
 
 #define CU_RET_ERR(status, msg) do { } while (0)
 #define CU_CHECK_PREV_ERR()     do { } while (0)
-#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
 
 #endif /* CHECK_CUDA_ERRORS */
 
@@ -139,24 +133,12 @@ enum class GpuApiCallBehavior;
  * The CUDA device information is queried and set at detection and contains
  * both information about the device/hardware returned by the runtime as well
  * as additional data like support status.
- *
- * \todo extract an object to manage NVML details
  */
 struct gmx_device_info_t
 {
     int                 id;                      /* id of the CUDA device */
     cudaDeviceProp      prop;                    /* CUDA device properties */
     int                 stat;                    /* result of the device check */
-    unsigned int        nvml_orig_app_sm_clock;  /* The original SM clock before we changed it */
-    unsigned int        nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
-    gmx_bool            nvml_app_clocks_changed; /* If application clocks have been changed */
-    unsigned int        nvml_set_app_sm_clock;   /* The SM clock we set */
-    unsigned int        nvml_set_app_mem_clock;  /* The memory clock we set */
-#if HAVE_NVML
-    nvmlDevice_t        nvml_device_id;          /* NVML device id */
-    // TODO This can become a bool with a more useful name
-    nvmlEnableState_t   nvml_is_restricted;      /* Status of application clocks permission */
-#endif                                           /* HAVE_NVML */
 };
 
 /*! Launches synchronous or asynchronous device to host memory copy.
index 3c0722c69d7156f91c652b0aebac36d62c5524e7..ed9483496138d12b4b05d263c7d6721f657f7c8c 100644 (file)
@@ -42,8 +42,6 @@
 
 #include "gpu_utils.h"
 
-#include "config.h"
-
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/logger.h"
 #include "gromacs/utility/programcontext.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
 #include "gromacs/utility/stringutil.h"
 
-#if HAVE_NVML
-#include <nvml.h>
-#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
-#else  /* HAVE_NVML */
-#define HAVE_NVML_APPLICATION_CLOCKS 0
-#endif /* HAVE_NVML */
-
-#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
-/*! Check for NVML error on the return status of a NVML API call. */
-#  define HANDLE_NVML_RET_ERR(status, msg) \
-    do { \
-        if (status != NVML_SUCCESS) \
-        { \
-            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
-        } \
-    } while (0)
-#else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-#  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
-#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-
-#if HAVE_NVML_APPLICATION_CLOCKS
-static const gmx_bool            bCompiledWithApplicationClockSupport = true;
-#else
-static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
-#endif
-
 /*! \internal \brief
  * Max number of devices supported by CUDA (for consistency checking).
  *
@@ -273,256 +244,7 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
     return 0;
 }
 
-#if HAVE_NVML_APPLICATION_CLOCKS
-/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
- *
- * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
- * matching PCI-E information from \cuda_dev with the available NVML devices.
- *
- * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
- * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
- */
-static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
-{
-    nvmlDevice_t nvml_device_id;
-    unsigned int nvml_device_count  = 0;
-    nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
-    bool         nvmlWasInitialized = false;
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
-    for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
-    {
-        nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
-        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
-        if (nvml_stat != NVML_SUCCESS)
-        {
-            break;
-        }
-
-        nvmlPciInfo_t nvml_pci_info;
-        nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
-        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
-        if (nvml_stat != NVML_SUCCESS)
-        {
-            break;
-        }
-        if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
-            static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
-            static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
-        {
-            nvmlWasInitialized         = true;
-            cuda_dev->nvml_device_id   = nvml_device_id;
-            break;
-        }
-    }
-    return nvmlWasInitialized;
-}
-
-/*! \brief Reads and returns the application clocks for device.
- *
- * \param[in]  device        The GPU device
- * \param[out] app_sm_clock  The current application SM clock
- * \param[out] app_mem_clock The current application memory clock
- * \returns if applacation clocks are supported
- */
-static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
-                                 unsigned int            *app_sm_clock,
-                                 unsigned int            *app_mem_clock)
-{
-    nvmlReturn_t nvml_stat;
-
-    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
-    if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
-    {
-        return false;
-    }
-    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
-    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
-    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
-
-    return true;
-}
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-
-/*! \brief Tries to set application clocks for the GPU with the given index.
- *
- * Application clocks are set to the max supported value to increase
- * performance if application clock permissions allow this. For future
- * GPU architectures a more sophisticated scheme might be required.
- *
- * \todo Refactor this into a detection phase and a work phase. Also
- * refactor to remove compile-time dependence on logging header.
- *
- * \param     mdlog         log file to write to
- * \param[in] cuda_dev      GPU device info for the GPU in use
- * \returns                 true if no error occurs during application clocks handling.
- */
-static gmx_bool init_gpu_application_clocks(
-        const gmx::MDLogger &mdlog,
-        gmx_device_info_t   *cuda_dev)
-{
-    const cudaDeviceProp *prop                        = &cuda_dev->prop;
-    int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
-    gmx_bool              bGpuCanUseApplicationClocks =
-        ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
-         (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
-    if (!bGpuCanUseApplicationClocks)
-    {
-        return true;
-    }
-#if !HAVE_NVML
-    GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-            "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
-            "      application clocks of the detected %s GPU to improve performance.\n"
-            "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
-            prop->name);
-    return true;
-#else
-    if (!bCompiledWithApplicationClockSupport)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
-                "      managing application clocks of the detected %s GPU to improve performance.\n"
-                "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
-                prop->name );
-        return true;
-    }
-
-    /* We've compiled with NVML application clocks support, and have a GPU that can use it */
-    nvmlReturn_t nvml_stat = NVML_SUCCESS;
-    char        *env;
-    //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
-    //      this variable can be later used to give a user more fine grained control.
-    env = getenv("GMX_GPU_APPLICATION_CLOCKS");
-    if (env != NULL && ( strcmp( env, "0") == 0 ||
-                         gmx_strcasecmp( env, "OFF") == 0 ||
-                         gmx_strcasecmp( env, "DISABLE") == 0 ))
-    {
-        return true;
-    }
-    nvml_stat = nvmlInit();
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
-    if (nvml_stat != NVML_SUCCESS)
-    {
-        return false;
-    }
-
-    if (!addNVMLDeviceId(cuda_dev))
-    {
-        return false;
-    }
-    //get current application clocks setting
-    if (!getApplicationClocks(cuda_dev,
-                              &cuda_dev->nvml_orig_app_sm_clock,
-                              &cuda_dev->nvml_orig_app_mem_clock))
-    {
-        return false;
-    }
-    //get max application clocks
-    unsigned int max_sm_clock  = 0;
-    unsigned int max_mem_clock = 0;
-    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-
-    cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
-    cuda_dev->nvml_app_clocks_changed = false;
-
-    if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
-    {
-        //TODO: This should probably be integrated into the GPU Properties table.
-        GMX_LOG(mdlog.info).appendTextFormatted(
-                "Application clocks (GPU clocks) for %s are (%d,%d)",
-                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
-        return true;
-    }
-
-    if (cuda_compute_capability >= 60)
-    {
-        // Only warn about not being able to change clocks if they are not already at the max values
-        if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
-        {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
-                    cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
-        }
-        return true;
-    }
-
-    nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
-
-    if (nvml_stat != NVML_SUCCESS)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
-                nvml_stat, nvmlErrorString(nvml_stat));
-        return false;
-    }
-
-    if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
-    {
-        // Only warn about not being able to change clocks if they are not already at the max values
-        if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
-        {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
-                    cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
-        }
-        return true;
-    }
-
-    /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
-       e.g. if max application clocks should not be used for certain GPUs. */
-    GMX_LOG(mdlog.warning).appendTextFormatted(
-            "Changing GPU application clocks for %s to (%d,%d)",
-            cuda_dev->prop.name, max_mem_clock, max_sm_clock);
-    nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-    cuda_dev->nvml_app_clocks_changed = true;
-    cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
-    cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
-
-    return true;
-#endif /* HAVE_NVML */
-}
-
-/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
- *
- * \param[in] gpu_dev  CUDA device information
- */
-static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
-{
-#if !HAVE_NVML_APPLICATION_CLOCKS
-    GMX_UNUSED_VALUE(cuda_dev);
-    return true;
-#else /* HAVE_NVML_APPLICATION_CLOCKS */
-    nvmlReturn_t nvml_stat = NVML_SUCCESS;
-    if (cuda_dev &&
-        cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
-        cuda_dev->nvml_app_clocks_changed)
-    {
-        /* Check if the clocks are still what we set them to.
-         * If so, set them back to the state we originally found them in.
-         * If not, don't touch them, because something else set them later.
-         */
-        unsigned int app_sm_clock, app_mem_clock;
-        getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
-        if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
-            app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
-        {
-            nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
-            HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
-        }
-    }
-    nvml_stat = nvmlShutdown();
-    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
-    return (nvml_stat == NVML_SUCCESS);
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-}
-
-void init_gpu(const gmx::MDLogger &mdlog,
-              gmx_device_info_t   *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
 {
     cudaError_t stat;
 
@@ -541,9 +263,6 @@ void init_gpu(const gmx::MDLogger &mdlog,
     }
 
     checkCompiledTargetCompatibility(deviceInfo);
-
-    //Ignoring return value as NVML errors should be treated not critical.
-    init_gpu_application_clocks(mdlog, deviceInfo);
 }
 
 void free_gpu(const gmx_device_info_t *deviceInfo)
@@ -566,11 +285,6 @@ void free_gpu(const gmx_device_info_t *deviceInfo)
         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
     }
 
-    if (!reset_gpu_application_clocks(deviceInfo))
-    {
-        gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
-    }
-
     stat = cudaDeviceReset();
     if (stat != cudaSuccess)
     {
index 65d8502e39e6d8ee6a9842dffdcd2e7c524f3dd0..ace93a5e3a6f01e4a9d986caf414777e62318a81 100644 (file)
@@ -58,7 +58,6 @@ struct gmx_gpu_info_t;
 
 namespace gmx
 {
-class MDLogger;
 }
 
 //! Enum which is only used to describe transfer calls at the moment
@@ -143,15 +142,13 @@ void free_gpu_info(const gmx_gpu_info_t *gpu_info);
  * TODO Doxygen complains about these - probably a Doxygen bug, since
  * the patterns here are the same as elsewhere in this header.
  *
- *  param[in]    mdlog        log file to write to
- * \param[inout] deviceInfo   device info of the GPU to initialize
+ * \param[in]    deviceInfo   device info of the GPU to initialize
  *
  * Issues a fatal error for any critical errors that occur during
  * initialization.
  */
 GPU_FUNC_QUALIFIER
-void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-              gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
+void init_gpu(const gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
  *
index 68e74775cee0fa71fc8717d829f05601a1f03240..af3018033b2542f77fd45c0856ecf6da39375e82 100644 (file)
@@ -381,8 +381,7 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
 }
 
 //! This function is documented in the header file
-void init_gpu(const gmx::MDLogger               & /*mdlog*/,
-              gmx_device_info_t                *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
 {
     assert(deviceInfo);
 
index 678ba99535d7139db614d2b63f5a776d49681854..d2ac9390fd1db938ec27c6576a1fdbb83a9cfca2 100644 (file)
@@ -1022,7 +1022,7 @@ int Mdrunner::mdrunner()
         {
             int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
             nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
-            init_gpu(mdlog, nonbondedDeviceInfo);
+            init_gpu(nonbondedDeviceInfo);
 
             if (DOMAINDECOMP(cr))
             {
@@ -1046,7 +1046,7 @@ int Mdrunner::mdrunner()
     if (thisRankHasPmeGpuTask)
     {
         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
-        init_gpu(mdlog, pmeDeviceInfo);
+        init_gpu(pmeDeviceInfo);
         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
         // TODO It would be nice to move this logic into the factory
         // function. See Redmine #2535.