The net value of NVML support isn't high enough to justify maintaining it.
gmx_device_info_t can now be used const-correctly in many more places.
Fixes #2655
Change-Id: I2fac7d8b5613bee3fe0a6020862fc57224f7f6c4
# TODO
# Add SIMD + OpenMP + CUDA asan build
# Add OpenMP + CUDA + device sharing TSAN build
-# Test with NVML support
# Test statically linked hwloc support (if/when it can work well)
# Test 3D DD (2D is partially covered in regressiontests)
# Test own-fftw build (from local copy of the file)
+++ /dev/null
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
-# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
-# and including many others, as listed in the AUTHORS file in the
-# top-level source directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-
-#.rst:
-# FindNVML
-# --------
-#
-# Find the NVIDIA Management Library (NVML) includes and library. NVML documentation
-# is available at: http://docs.nvidia.com/deploy/nvml-api/index.html
-#
-# Starting with CUDA 8 NVML is part of the CUDA Toolkit. Prior to CUDA 8 NVML was part
-# of the GPU Deployment Kit (GDK) and GPU_DEPLOYMENT_KIT_ROOT_DIR can be specified
-# if the GPU Deployment Kit is not installed in a default location.
-#
-# FindNVML defines the following variables:
-#
-# NVML_INCLUDE_DIR, where to find nvml.h, etc.
-# NVML_LIBRARY, the libraries needed to use NVML.
-# NVML_FOUND, If false, do not try to use NVML.
-#
-
-# Jiri Kraus, NVIDIA Corp (nvidia.com - jkraus)
-#
-# Copyright (c) 2008 - 2014,2017 NVIDIA Corporation. All rights reserved.
-#
-# This code is licensed under the MIT License. See the FindNVML.cmake script
-# for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-if( CMAKE_SYSTEM_NAME STREQUAL "Windows" )
- set(NVML_NAMES nvml)
-else()
- set(NVML_NAMES nvidia-ml)
-endif()
-
-if (CUDA_FOUND)
- if( CMAKE_SYSTEM_NAME STREQUAL "Windows" )
- if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
- set( NVML_LIB_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/lib" )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_LIB_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/lib")
- endif()
-
- set( NVML_INC_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/include" )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_INC_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/include")
- endif()
- else()
- set( NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" )
- set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
- endif()
- else()
-
- set( NVML_LIB_PATHS /usr/lib64 )
- if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
- # The Linux installer for the GPU Deployment Kit adds a "usr"
- # suffix to a custom path if one is used, so a user could
- # reasonably set GPU_DEPLOYMENT_KIT_ROOT_DIR to the value they
- # passed to the installer, or the root where they later found the
- # kit to be installed. Below, we cater for both possibilities.
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_LIB_PATHS
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/src/gdk/nvml/lib"
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/src/gdk/nvml/lib"
- )
- endif()
- else()
- list(APPEND NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs")
- endif()
-
- if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
- set( NVML_INC_PATHS /usr/include/nvidia/gdk/ /usr/include )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_INC_PATHS
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/include/nvidia/gdk"
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/include/nvidia/gdk"
- )
- endif()
- else()
- set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
- endif()
- endif()
-endif()
-
-find_library(NVML_LIBRARY NAMES ${NVML_NAMES} PATHS ${NVML_LIB_PATHS} )
-
-find_path(NVML_INCLUDE_DIR nvml.h PATHS ${NVML_INC_PATHS})
-
-# handle the QUIETLY and REQUIRED arguments and set NVML_FOUND to TRUE if
-# all listed variables are TRUE
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(NVML DEFAULT_MSG NVML_LIBRARY NVML_INCLUDE_DIR)
-
-mark_as_advanced(NVML_LIBRARY NVML_INCLUDE_DIR)
endif()
endif()
-# Try to find NVML if a GPU accelerated binary should be build.
-if (GMX_GPU)
- if (DEFINED NVML_LIBRARY)
- set(NVML_FIND_QUIETLY TRUE)
- endif()
- find_package(NVML)
- # TODO Default to off, since linking is not implemented reliably
- option(GMX_USE_NVML "Use NVML support for better CUDA performance" OFF)
- mark_as_advanced(GMX_USE_NVML)
- if(GMX_USE_NVML)
- if(NVML_FOUND)
- include_directories(SYSTEM ${NVML_INCLUDE_DIR})
- set(HAVE_NVML 1)
- list(APPEND GMX_EXTRA_LIBRARIES ${NVML_LIBRARY})
- else()
- message(FATAL_ERROR "NVML support was required, but was not detected. Please consult the install guide.")
- endif()
- endif()
-endif()
-
# Annoyingly enough, FindCUDA leaves a few variables behind as non-advanced.
# We need to mark these advanced outside the conditional, otherwise, if the
# user turns GMX_GPU=OFF after a failed cmake pass, these variables will be
need to specify manually which of your C++ compilers should be used,
e.g. with the advanced option ``CUDA_HOST_COMPILER``.
-To make it
-possible to get best performance from NVIDIA Tesla and Quadro GPUs,
-you should install the `GPU Deployment Kit
-<https://developer.nvidia.com/gpu-deployment-kit>`_ and configure
-|Gromacs| to use it by setting the CMake variable
-``-DGPU_DEPLOYMENT_KIT_ROOT_DIR=/path/to/your/kit``. The NVML support
-is most useful if
-``nvidia-smi --applications-clocks-permission=UNRESTRICTED`` is run
-(as root). When application clocks permissions are unrestricted, the
-GPU clock speed can be increased automatically, which increases the
-GPU kernel performance roughly proportional to the clock
-increase. When using |Gromacs| on suitable GPUs under restricted
-permissions, clocks cannot be changed, and in that case informative
-log file messages will be produced. Background details can be found at
-this `NVIDIA blog post
-<http://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/>`_.
-NVML support is only available if detected, and may be disabled by
-turning off the ``GMX_USE_NVML`` CMake advanced option.
-
By default, code will be generated for the most common CUDA architectures.
However, to reduce build time and binary size we do not generate code for
every single possible architecture, which in rare cases (say, Tegra systems)
Removed features
^^^^^^^^^^^^^^^^
+NVML support removed on NVIDIA GPUs
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+NVML support (for reporting GPU application clocks or changing these
+for higher throughput) is no longer available. It was only ever supported on
+high-end hardware, and on recent generations of hardware changing clocks is only
+useful when root permissions are available to the user. It may become less useful
+as GROMACS evolves, it complicated the GROMACS code, and it wasn't regularly tested or maintained.
+It might return if some of these conditions change.
Running :ref:`mdrun <gmx mdrun>` with GPUs
------------------------------------------
-NVIDIA GPUs from the professional line (Tesla or Quadro) starting with
-the Kepler generation (compute capability 3.5 and later) support changing the
-processor and memory clock frequency with the help of the applications clocks feature.
-With many workloads, using higher clock rates than the default provides significant
-performance improvements.
-For more information see the `NVIDIA blog article`_ on this topic.
-For |Gromacs| the highest application clock rates are optimal on all hardware
-available to date (up to and including Maxwell, compute capability 5.2).
-
-Application clocks can be set using the NVIDIA system managemet tool
-``nvidia-smi``. If the system permissions allow, :ref:`gmx mdrun` has
-built-in support to set application clocks if built with :ref:`NVML support<CUDA GPU acceleration>`.
-Note that application clocks are a global setting, hence affect the
-performance of all applications that use the respective GPU(s).
-For this reason, :ref:`gmx mdrun` sets application clocks at initialization
-to the values optimal for |Gromacs| and it restores them before exiting
-to the values found at startup, unless it detects that they were altered
-during its runtime.
-
-.. _NVIDIA blog article: https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/
-
.. _gmx-gpu-tasks:
Types of GPU tasks
/* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
#define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
-/* Use NVML */
-#cmakedefine01 HAVE_NVML
-
/* Define relative path to OpenCL kernels */
#define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"
* \param[in] gpuInfo The GPU information structure.
* \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
*/
-static void pme_gpu_init(gmx_pme_t *pme,
- gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t *pme,
+ const gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram)
{
pme->gpu = new PmeGpu();
PmeGpu *pmeGpu = pme->gpu;
}
}
-void pme_gpu_reinit(gmx_pme_t *pme,
- gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t *pme,
+ const gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram)
{
if (!pme_gpu_active(pme))
{
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
*
* \param[in,out] pme The PME structure.
- * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in] gpuInfo The GPU information structure.
* \param[in] pmeGpuProgram The PME GPU program data
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
+ const gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM
/*! \libinternal \brief
int nAtomsAlloc;
/*! \brief A pointer to the device used during the execution. */
- gmx_device_info_t *deviceInfo;
+ const gmx_device_info_t *deviceInfo;
/*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
* Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
return (enumerator + denominator - 1)/denominator;
}
-gmx_pme_t *gmx_pme_init(const t_commrec *cr,
- const NumPmeDomains &numPmeDomains,
- const t_inputrec *ir,
- int homenr,
- gmx_bool bFreeEnergy_q,
- gmx_bool bFreeEnergy_lj,
- gmx_bool bReproducible,
- real ewaldcoeff_q,
- real ewaldcoeff_lj,
- int nthread,
- PmeRunMode runMode,
- PmeGpu *pmeGpu,
- gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- const gmx::MDLogger & /*mdlog*/)
+gmx_pme_t *gmx_pme_init(const t_commrec *cr,
+ const NumPmeDomains &numPmeDomains,
+ const t_inputrec *ir,
+ int homenr,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
+ gmx_bool bReproducible,
+ real ewaldcoeff_q,
+ real ewaldcoeff_lj,
+ int nthread,
+ PmeRunMode runMode,
+ PmeGpu *pmeGpu,
+ const gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
+ const gmx::MDLogger & /*mdlog*/)
{
int use_threads, sum_use_threads, i;
ivec ndata;
int nthread,
PmeRunMode runMode,
PmeGpu *pmeGpu,
- gmx_device_info_t *gpuInfo,
+ const gmx_device_info_t *gpuInfo,
PmeGpuProgramHandle pmeGpuProgram,
const gmx::MDLogger &mdlog);
//! PME initialization - internal
static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec,
CodePath mode,
- gmx_device_info_t *gpuInfo,
+ const gmx_device_info_t *gpuInfo,
PmeGpuProgramHandle pmeGpuProgram,
size_t atomCount,
const Matrix3x3 &box,
//! Simple PME initialization based on input, no atom data
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
CodePath mode,
- gmx_device_info_t *gpuInfo,
+ const gmx_device_info_t *gpuInfo,
PmeGpuProgramHandle pmeGpuProgram,
const Matrix3x3 &box,
real ewaldCoeff_q,
//! PME initialization with atom data
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
CodePath mode,
- gmx_device_info_t *gpuInfo,
+ const gmx_device_info_t *gpuInfo,
PmeGpuProgramHandle pmeGpuProgram,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
//! Simple PME initialization (no atom data)
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
CodePath mode = CodePath::CPU,
- gmx_device_info_t *gpuInfo = nullptr,
+ const gmx_device_info_t *gpuInfo = nullptr,
PmeGpuProgramHandle pmeGpuProgram = nullptr,
const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
//! PME initialization with atom data and system box
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
CodePath mode,
- gmx_device_info_t *gpuInfo,
+ const gmx_device_info_t *gpuInfo,
PmeGpuProgramHandle pmeGpuProgram,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
// PME can only run on the CPU, so don't make any more test contexts.
return;
}
- const MDLogger dummyLogger;
// Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
{
- gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
- init_gpu(dummyLogger, deviceInfo);
+ const gmx_device_info_t *deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
+ init_gpu(deviceInfo);
char stmp[200] = {};
get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
//! Readable description
std::string description_;
//! Device information pointer
- gmx_device_info_t *deviceInfo_;
+ const gmx_device_info_t *deviceInfo_;
//! Persistent compiled GPU kernels for PME.
PmeGpuProgramStorage program_;
//! Returns the code path for this context.
CodePath getCodePath() const { return codePath_; }
//! Returns a human-readable context description line
- std::string getDescription() const{return description_; }
+ std::string getDescription() const{return description_; }
//! Returns the device info pointer
- gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+ const gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
//! Returns the persistent PME GPU kernels
- PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
+ PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
//! Constructs the context
- TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
+ TestHardwareContext(CodePath codePath, const char *description, const gmx_device_info_t *deviceInfo) :
codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
program_(buildPmeGpuProgram(deviceInfo_)) {}
~TestHardwareContext();
#ifndef GMX_GPU_UTILS_CUDAUTILS_CUH
#define GMX_GPU_UTILS_CUDAUTILS_CUH
-#include "config.h"
-
#include <stdio.h>
-#if HAVE_NVML
-#include <nvml.h>
-#endif /* HAVE_NVML */
#include <array>
#include <string>
#define CU_RET_ERR(status, msg) do { } while (0)
#define CU_CHECK_PREV_ERR() do { } while (0)
-#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* CHECK_CUDA_ERRORS */
* The CUDA device information is queried and set at detection and contains
* both information about the device/hardware returned by the runtime as well
* as additional data like support status.
- *
- * \todo extract an object to manage NVML details
*/
struct gmx_device_info_t
{
int id; /* id of the CUDA device */
cudaDeviceProp prop; /* CUDA device properties */
int stat; /* result of the device check */
- unsigned int nvml_orig_app_sm_clock; /* The original SM clock before we changed it */
- unsigned int nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
- gmx_bool nvml_app_clocks_changed; /* If application clocks have been changed */
- unsigned int nvml_set_app_sm_clock; /* The SM clock we set */
- unsigned int nvml_set_app_mem_clock; /* The memory clock we set */
-#if HAVE_NVML
- nvmlDevice_t nvml_device_id; /* NVML device id */
- // TODO This can become a bool with a more useful name
- nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
-#endif /* HAVE_NVML */
};
/*! Launches synchronous or asynchronous device to host memory copy.
#include "gpu_utils.h"
-#include "config.h"
-
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/logger.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
#include "gromacs/utility/stringutil.h"
-#if HAVE_NVML
-#include <nvml.h>
-#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
-#else /* HAVE_NVML */
-#define HAVE_NVML_APPLICATION_CLOCKS 0
-#endif /* HAVE_NVML */
-
-#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
-/*! Check for NVML error on the return status of a NVML API call. */
-# define HANDLE_NVML_RET_ERR(status, msg) \
- do { \
- if (status != NVML_SUCCESS) \
- { \
- gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
- } \
- } while (0)
-#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-# define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
-#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
-
-#if HAVE_NVML_APPLICATION_CLOCKS
-static const gmx_bool bCompiledWithApplicationClockSupport = true;
-#else
-static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
-#endif
-
/*! \internal \brief
* Max number of devices supported by CUDA (for consistency checking).
*
return 0;
}
-#if HAVE_NVML_APPLICATION_CLOCKS
-/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
- *
- * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
- * matching PCI-E information from \cuda_dev with the available NVML devices.
- *
- * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
- * \returns true if \cuda_dev could be enriched with matching NVML device information.
- */
-static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
-{
- nvmlDevice_t nvml_device_id;
- unsigned int nvml_device_count = 0;
- nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
- bool nvmlWasInitialized = false;
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
- for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
- {
- nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
- if (nvml_stat != NVML_SUCCESS)
- {
- break;
- }
-
- nvmlPciInfo_t nvml_pci_info;
- nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
- if (nvml_stat != NVML_SUCCESS)
- {
- break;
- }
- if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
- static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
- static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
- {
- nvmlWasInitialized = true;
- cuda_dev->nvml_device_id = nvml_device_id;
- break;
- }
- }
- return nvmlWasInitialized;
-}
-
-/*! \brief Reads and returns the application clocks for device.
- *
- * \param[in] device The GPU device
- * \param[out] app_sm_clock The current application SM clock
- * \param[out] app_mem_clock The current application memory clock
- * \returns if applacation clocks are supported
- */
-static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
- unsigned int *app_sm_clock,
- unsigned int *app_mem_clock)
-{
- nvmlReturn_t nvml_stat;
-
- nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
- if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
- {
- return false;
- }
- HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
- nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
- HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
-
- return true;
-}
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-
-/*! \brief Tries to set application clocks for the GPU with the given index.
- *
- * Application clocks are set to the max supported value to increase
- * performance if application clock permissions allow this. For future
- * GPU architectures a more sophisticated scheme might be required.
- *
- * \todo Refactor this into a detection phase and a work phase. Also
- * refactor to remove compile-time dependence on logging header.
- *
- * \param mdlog log file to write to
- * \param[in] cuda_dev GPU device info for the GPU in use
- * \returns true if no error occurs during application clocks handling.
- */
-static gmx_bool init_gpu_application_clocks(
- const gmx::MDLogger &mdlog,
- gmx_device_info_t *cuda_dev)
-{
- const cudaDeviceProp *prop = &cuda_dev->prop;
- int cuda_compute_capability = prop->major * 10 + prop->minor;
- gmx_bool bGpuCanUseApplicationClocks =
- ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
- (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
- if (!bGpuCanUseApplicationClocks)
- {
- return true;
- }
-#if !HAVE_NVML
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
- " application clocks of the detected %s GPU to improve performance.\n"
- " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
- prop->name);
- return true;
-#else
- if (!bCompiledWithApplicationClockSupport)
- {
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
- " managing application clocks of the detected %s GPU to improve performance.\n"
- " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
- prop->name );
- return true;
- }
-
- /* We've compiled with NVML application clocks support, and have a GPU that can use it */
- nvmlReturn_t nvml_stat = NVML_SUCCESS;
- char *env;
- //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
- // this variable can be later used to give a user more fine grained control.
- env = getenv("GMX_GPU_APPLICATION_CLOCKS");
- if (env != NULL && ( strcmp( env, "0") == 0 ||
- gmx_strcasecmp( env, "OFF") == 0 ||
- gmx_strcasecmp( env, "DISABLE") == 0 ))
- {
- return true;
- }
- nvml_stat = nvmlInit();
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
- if (nvml_stat != NVML_SUCCESS)
- {
- return false;
- }
-
- if (!addNVMLDeviceId(cuda_dev))
- {
- return false;
- }
- //get current application clocks setting
- if (!getApplicationClocks(cuda_dev,
- &cuda_dev->nvml_orig_app_sm_clock,
- &cuda_dev->nvml_orig_app_mem_clock))
- {
- return false;
- }
- //get max application clocks
- unsigned int max_sm_clock = 0;
- unsigned int max_mem_clock = 0;
- nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
- nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-
- cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
- cuda_dev->nvml_app_clocks_changed = false;
-
- if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
- {
- //TODO: This should probably be integrated into the GPU Properties table.
- GMX_LOG(mdlog.info).appendTextFormatted(
- "Application clocks (GPU clocks) for %s are (%d,%d)",
- cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
- return true;
- }
-
- if (cuda_compute_capability >= 60)
- {
- // Only warn about not being able to change clocks if they are not already at the max values
- if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
- {
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
- cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
- }
- return true;
- }
-
- nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
-
- if (nvml_stat != NVML_SUCCESS)
- {
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
- nvml_stat, nvmlErrorString(nvml_stat));
- return false;
- }
-
- if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
- {
- // Only warn about not being able to change clocks if they are not already at the max values
- if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
- {
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
- cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
- }
- return true;
- }
-
- /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
- e.g. if max application clocks should not be used for certain GPUs. */
- GMX_LOG(mdlog.warning).appendTextFormatted(
- "Changing GPU application clocks for %s to (%d,%d)",
- cuda_dev->prop.name, max_mem_clock, max_sm_clock);
- nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
- cuda_dev->nvml_app_clocks_changed = true;
- cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
- cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
-
- return true;
-#endif /* HAVE_NVML */
-}
-
-/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
- *
- * \param[in] gpu_dev CUDA device information
- */
-static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
-{
-#if !HAVE_NVML_APPLICATION_CLOCKS
- GMX_UNUSED_VALUE(cuda_dev);
- return true;
-#else /* HAVE_NVML_APPLICATION_CLOCKS */
- nvmlReturn_t nvml_stat = NVML_SUCCESS;
- if (cuda_dev &&
- cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
- cuda_dev->nvml_app_clocks_changed)
- {
- /* Check if the clocks are still what we set them to.
- * If so, set them back to the state we originally found them in.
- * If not, don't touch them, because something else set them later.
- */
- unsigned int app_sm_clock, app_mem_clock;
- getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
- if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
- app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
- {
- nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
- }
- }
- nvml_stat = nvmlShutdown();
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
- return (nvml_stat == NVML_SUCCESS);
-#endif /* HAVE_NVML_APPLICATION_CLOCKS */
-}
-
-void init_gpu(const gmx::MDLogger &mdlog,
- gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
{
cudaError_t stat;
}
checkCompiledTargetCompatibility(deviceInfo);
-
- //Ignoring return value as NVML errors should be treated not critical.
- init_gpu_application_clocks(mdlog, deviceInfo);
}
void free_gpu(const gmx_device_info_t *deviceInfo)
fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
}
- if (!reset_gpu_application_clocks(deviceInfo))
- {
- gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
- }
-
stat = cudaDeviceReset();
if (stat != cudaSuccess)
{
namespace gmx
{
-class MDLogger;
}
//! Enum which is only used to describe transfer calls at the moment
* TODO Doxygen complains about these - probably a Doxygen bug, since
* the patterns here are the same as elsewhere in this header.
*
- * param[in] mdlog log file to write to
- * \param[inout] deviceInfo device info of the GPU to initialize
+ * \param[in] deviceInfo device info of the GPU to initialize
*
* Issues a fatal error for any critical errors that occur during
* initialization.
*/
GPU_FUNC_QUALIFIER
-void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
- gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
+void init_gpu(const gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
*
}
//! This function is documented in the header file
-void init_gpu(const gmx::MDLogger & /*mdlog*/,
- gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx_device_info_t *deviceInfo)
{
assert(deviceInfo);
{
int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
- init_gpu(mdlog, nonbondedDeviceInfo);
+ init_gpu(nonbondedDeviceInfo);
if (DOMAINDECOMP(cr))
{
if (thisRankHasPmeGpuTask)
{
pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
- init_gpu(mdlog, pmeDeviceInfo);
+ init_gpu(pmeDeviceInfo);
pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
// TODO It would be nice to move this logic into the factory
// function. See Redmine #2535.