--- /dev/null
- int id; /* id of the CUDA device */
- cudaDeviceProp prop; /* CUDA device properties */
- int stat; /* result of the device check */
- gmx_bool nvml_initialized; /* If NVML was initialized */
- gmx_bool nvml_ap_clocks_changed; /* If application clocks have been changed */
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_CUDAUTILS_CUH
+#define GMX_GPU_UTILS_CUDAUTILS_CUH
+
+#include "config.h"
+
+#include <stdio.h>
+#if HAVE_NVML
+#include <nvml.h>
+#endif /* HAVE_NVML */
+
+#include "gromacs/utility/fatalerror.h"
+
+/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
+ based on where they occur in the code:
+ - non performance-critical: these errors are unsafe to be ignored and must be
+ _always_ checked for, e.g. initializations
+ - performance critical: handling errors might hurt performance so care need to be taken
+ when/if we should check for them at all, e.g. in cu_upload_X. However, we should be
+ able to turn the check for these errors on!
+
+ Probably we'll need two sets of the macros below...
+
+ */
+#define CHECK_CUDA_ERRORS
+
+#ifdef CHECK_CUDA_ERRORS
+
+/*! Check for CUDA error on the return status of a CUDA RT API call. */
+#define CU_RET_ERR(status, msg) \
+ do { \
+ if (status != cudaSuccess) \
+ { \
+ gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
+ } \
+ } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error. */
+#define CU_CHECK_PREV_ERR() \
+ do { \
+ cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
+ if (_CU_CHECK_PREV_ERR_status != cudaSuccess) { \
+ gmx_warning("Just caught a previously occurred CUDA error (%s), will try to continue.", cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR(msg) \
+ do { \
+ cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
+ if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR_SYNC(msg) \
+ do { \
+ cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
+ if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+#else /* CHECK_CUDA_ERRORS */
+
+#define CU_RET_ERR(status, msg) do { } while (0)
+#define CU_CHECK_PREV_ERR() do { } while (0)
+#define CU_LAUNCH_ERR(msg) do { } while (0)
+#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
+#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+
+#endif /* CHECK_CUDA_ERRORS */
+
+/*! \brief CUDA device information.
+ *
+ * The CUDA device information is queried and set at detection and contains
+ * both information about the device/hardware returned by the runtime as well
+ * as additional data like support status.
+ */
+struct gmx_device_info_t
+{
- nvmlDevice_t nvml_device_id; /* NVML device id */
- nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
- #endif /* HAVE_NVML */
++ int id; /* id of the CUDA device */
++ cudaDeviceProp prop; /* CUDA device properties */
++ int stat; /* result of the device check */
++ gmx_bool nvml_initialized; /* If NVML was initialized */
++ unsigned int nvml_orig_app_sm_clock; /* The original SM clock before we changed it */
++ unsigned int nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
++ gmx_bool nvml_app_clocks_changed; /* If application clocks have been changed */
++ unsigned int nvml_set_app_sm_clock; /* The SM clock we set */
++ unsigned int nvml_set_app_mem_clock; /* The memory clock we set */
+#if HAVE_NVML
++ nvmlDevice_t nvml_device_id; /* NVML device id */
++ nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
++#endif /* HAVE_NVML */
+};
+
+
+/*! Launches asynchronous host to device memory copy in stream 0. */
+int cu_copy_D2H(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous host to device memory copy in stream s. */
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates host memory and launches synchronous host to device memory copy. */
+int cu_copy_D2H_alloc(void ** /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+
+/*! Launches synchronous host to device memory copy. */
+int cu_copy_H2D(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous host to device memory copy in stream s. */
+int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates device memory and launches synchronous host to device memory copy. */
+int cu_copy_H2D_alloc(void ** /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+
+/*! Frees device memory and resets the size and allocation size to -1. */
+void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL);
+
+/*! Reallocates the device memory and copies data from the host. */
+void cu_realloc_buffered(void **d_dest, void *h_src,
+ size_t type_size,
+ int *curr_size, int *curr_alloc_size,
+ int req_size,
+ cudaStream_t s,
+ bool bAsync);
+
+/*! Waits for event e to complete, */
+int cu_wait_event(cudaEvent_t /*e*/);
+
+/*! Calculates and returns the time elapsed between event start and end. */
+float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/);
+
+/*! Waits for event end to complete and calculates the time between start and end. */
+int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/);
+
+#endif
--- /dev/null
- if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \file
+ * \brief Define functions for detection and initialization for CUDA devices.
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_utils.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cuda_profiler_api.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/pmalloc_cuda.h"
+#include "gromacs/hardware/gpu_hw_info.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/smalloc.h"
+
+#if HAVE_NVML
+#include <nvml.h>
+#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
+#else /* HAVE_NVML */
+#define HAVE_NVML_APPLICATION_CLOCKS 0
+#endif /* HAVE_NVML */
+
+#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
+/*! Check for NVML error on the return status of a NVML API call. */
+# define HANDLE_NVML_RET_ERR(status, msg) \
+ do { \
+ if (status != NVML_SUCCESS) \
+ { \
+ gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
+ } \
+ } while (0)
+#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
+# define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
+
+#if HAVE_NVML_APPLICATION_CLOCKS
+static const gmx_bool bCompiledWithApplicationClockSupport = true;
+#else
+static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
+#endif
+
+/*! \internal \brief
+ * Max number of devices supported by CUDA (for consistency checking).
+ *
+ * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
+ */
+static int cuda_max_device_count = 32;
+
+static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
+
+/** Dummy kernel used for sanity checking. */
+__global__ void k_dummy_test()
+{
+}
+
+
+/*!
+ * \brief Runs GPU sanity checks.
+ *
+ * Runs a series of checks to determine that the given GPU and underlying CUDA
+ * driver/runtime functions properly.
+ * Returns properties of a device with given ID or the one that has
+ * already been initialized earlier in the case if of \dev_id == -1.
+ *
+ * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
+ * \param[out] dev_prop pointer to the structure in which the device properties will be returned
+ * \returns 0 if the device looks OK
+ *
+ * TODO: introduce errors codes and handle errors more smoothly.
+ */
+static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
+{
+ cudaError_t cu_err;
+ int dev_count, id;
+
+ cu_err = cudaGetDeviceCount(&dev_count);
+ if (cu_err != cudaSuccess)
+ {
+ fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
+ cudaGetErrorString(cu_err));
+ return -1;
+ }
+
+ /* no CUDA compatible device at all */
+ if (dev_count == 0)
+ {
+ return -1;
+ }
+
+ /* things might go horribly wrong if cudart is not compatible with the driver */
+ if (dev_count < 0 || dev_count > cuda_max_device_count)
+ {
+ return -1;
+ }
+
+ if (dev_id == -1) /* device already selected let's not destroy the context */
+ {
+ cu_err = cudaGetDevice(&id);
+ if (cu_err != cudaSuccess)
+ {
+ fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
+ cudaGetErrorString(cu_err));
+ return -1;
+ }
+ }
+ else
+ {
+ id = dev_id;
+ if (id > dev_count - 1) /* pfff there's no such device */
+ {
+ fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
+ dev_id, dev_count);
+ return -1;
+ }
+ }
+
+ memset(dev_prop, 0, sizeof(cudaDeviceProp));
+ cu_err = cudaGetDeviceProperties(dev_prop, id);
+ if (cu_err != cudaSuccess)
+ {
+ fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
+ cudaGetErrorString(cu_err));
+ return -1;
+ }
+
+ /* both major & minor is 9999 if no CUDA capable devices are present */
+ if (dev_prop->major == 9999 && dev_prop->minor == 9999)
+ {
+ return -1;
+ }
+ /* we don't care about emulation mode */
+ if (dev_prop->major == 0)
+ {
+ return -1;
+ }
+
+ if (id != -1)
+ {
+ cu_err = cudaSetDevice(id);
+ if (cu_err != cudaSuccess)
+ {
+ fprintf(stderr, "Error %d while switching to device #%d: %s\n",
+ cu_err, id, cudaGetErrorString(cu_err));
+ return -1;
+ }
+ }
+
+ /* try to execute a dummy kernel */
+ k_dummy_test<<< 1, 512>>> ();
+ if (cudaThreadSynchronize() != cudaSuccess)
+ {
+ return -1;
+ }
+
+ /* destroy context if we created one */
+ if (id != -1)
+ {
+ cu_err = cudaDeviceReset();
+ CU_RET_ERR(cu_err, "cudaDeviceReset failed");
+ }
+
+ return 0;
+}
+
+#if HAVE_NVML
+/* TODO: We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in CUDA code.
+ */
+static void md_print_info(FILE *fplog,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ if (fplog != NULL)
+ {
+ /* We should only print to stderr on the master node,
+ * in most cases fplog is only set on the master node, so this works.
+ */
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ va_start(ap, fmt);
+ vfprintf(fplog, fmt, ap);
+ va_end(ap);
+ }
+}
+#endif /*HAVE_NVML*/
+
+/* TODO: We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in CUDA code.
+ * This is replicated from nbnxn_cuda_data_mgmt.cu.
+ */
+static void md_print_warn(FILE *fplog,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ if (fplog != NULL)
+ {
+ /* We should only print to stderr on the master node,
+ * in most cases fplog is only set on the master node, so this works.
+ */
+ va_start(ap, fmt);
+ fprintf(stderr, "\n");
+ vfprintf(stderr, fmt, ap);
+ fprintf(stderr, "\n");
+ va_end(ap);
+
+ va_start(ap, fmt);
+ fprintf(fplog, "\n");
+ vfprintf(fplog, fmt, ap);
+ fprintf(fplog, "\n");
+ va_end(ap);
+ }
+}
+
+#if HAVE_NVML_APPLICATION_CLOCKS
+/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
+ *
+ * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
+ * matching PCI-E information from \cuda_dev with the available NVML devices.
+ *
+ * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
+ * \returns true if \cuda_dev could be enriched with matching NVML device information.
+ */
+static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
+{
+ nvmlDevice_t nvml_device_id;
+ unsigned int nvml_device_count = 0;
+ nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
+ cuda_dev->nvml_initialized = false;
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
+ for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
+ {
+ nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+
+ nvmlPciInfo_t nvml_pci_info;
+ nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+ if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
+ {
+ cuda_dev->nvml_initialized = true;
+ cuda_dev->nvml_device_id = nvml_device_id;
+ break;
+ }
+ }
+ return cuda_dev->nvml_initialized;
+}
++
++/*! \brief Reads and returns the application clocks for device.
++ *
++ * \param[in] device The GPU device
++ * \param[out] app_sm_clock The current application SM clock
++ * \param[out] app_mem_clock The current application memory clock
++ * \returns if applacation clocks are supported
++ */
++static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
++ unsigned int *app_sm_clock,
++ unsigned int *app_mem_clock)
++{
++ nvmlReturn_t nvml_stat;
++
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
++ if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++ {
++ return false;
++ }
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++
++ return true;
++}
+#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+
+/*! \brief Tries to set application clocks for the GPU with the given index.
+ *
+ * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
+ * to handle the application clocks for. Application clocks are set to the
+ * max supported value to increase performance if application clock permissions
+ * allow this. For future GPU architectures a more sophisticated scheme might be
+ * required.
+ *
+ * \param[out] fplog log file to write to
+ * \param[in] gpuid index of the GPU to set application clocks for
+ * \param[in] gpu_info GPU info of all detected devices in the system.
+ * \returns true if no error occurs during application clocks handling.
+ */
+static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
+{
+ const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop;
+ int cuda_version_number = prop->major * 10 + prop->minor;
+ gmx_bool bGpuCanUseApplicationClocks =
+ ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
+ (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
+ if (!bGpuCanUseApplicationClocks)
+ {
+ return true;
+ }
+#if !HAVE_NVML
+ int cuda_driver = 0;
+ int cuda_runtime = 0;
+ cudaDriverGetVersion(&cuda_driver);
+ cudaRuntimeGetVersion(&cuda_runtime);
+ md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
+ " application clocks of the detected %s GPU to improve performance.\n"
+ " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
+ prop->name);
+ return true;
+#else
+ if (!bCompiledWithApplicationClockSupport)
+ {
+ int cuda_driver = 0;
+ int cuda_runtime = 0;
+ cudaDriverGetVersion(&cuda_driver);
+ cudaRuntimeGetVersion(&cuda_runtime);
+ md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
+ " managing application clocks of the detected %s GPU to improve performance.\n"
+ " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
+ prop->name );
+ return true;
+ }
+
+ /* We've compiled with NVML application clocks support, and have a GPU that can use it */
+ nvmlReturn_t nvml_stat = NVML_SUCCESS;
+ char *env;
+ //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
+ // this variable can be later used to give a user more fine grained control.
+ env = getenv("GMX_GPU_APPLICATION_CLOCKS");
+ if (env != NULL && ( strcmp( env, "0") == 0 ||
+ gmx_strcasecmp( env, "OFF") == 0 ||
+ gmx_strcasecmp( env, "DISABLE") == 0 ))
+ {
+ return true;
+ }
+ nvml_stat = nvmlInit();
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ return false;
+ }
- unsigned int app_sm_clock = 0;
- unsigned int app_mem_clock = 0;
- nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
- if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++
++ gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
++
++ if (!addNVMLDeviceId(cuda_dev))
+ {
+ return false;
+ }
+ //get current application clocks setting
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
- nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
++ if (!getApplicationClocks(cuda_dev,
++ &cuda_dev->nvml_orig_app_sm_clock,
++ &cuda_dev->nvml_orig_app_mem_clock))
+ {
+ return false;
+ }
- nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
+ //get max application clocks
+ unsigned int max_sm_clock = 0;
+ unsigned int max_mem_clock = 0;
- nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
- gpu_info->gpu_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED;
- gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
+
- nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
++ cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
++ cuda_dev->nvml_app_clocks_changed = false;
+
- if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
++ nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
+
+ /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
+ e.g. if max application clocks should not be used for certain GPUs. */
- md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
- nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
++ if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED)
+ {
- gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
++ md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock);
++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
- else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
++ cuda_dev->nvml_app_clocks_changed = true;
++ cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
++ cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
+ }
- md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock)
+ {
- else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
++ md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
+ }
- //TODO: This should probably be integrated into the GPU Properties table.
- md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock)
+ {
- cuda_dev->nvml_ap_clocks_changed)
++ md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
+ }
+ else
+ {
+ md_print_warn( fplog, "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
+ }
+ return (nvml_stat == NVML_SUCCESS);
+#endif /* HAVE_NVML */
+}
+
+/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
+ *
+ * \param[in] gpu_dev CUDA device information
+ */
+static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
+{
+#if !HAVE_NVML_APPLICATION_CLOCKS
+ GMX_UNUSED_VALUE(cuda_dev);
+ return true;
+#else /* HAVE_NVML_APPLICATION_CLOCKS */
+ nvmlReturn_t nvml_stat = NVML_SUCCESS;
+ if (cuda_dev &&
+ cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
- nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id );
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" );
++ cuda_dev->nvml_app_clocks_changed)
+ {
++ /* Check if the clocks are still what we set them to.
++ * If so, set them back to the state we originally found them in.
++ * If not, don't touch them, because something else set them later.
++ */
++ unsigned int app_sm_clock, app_mem_clock;
++ getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
++ if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
++ app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
++ {
++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
++ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
++ }
+ }
+ nvml_stat = nvmlShutdown();
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
+ return (nvml_stat == NVML_SUCCESS);
+#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+}
+
+gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
+ const struct gmx_gpu_info_t *gpu_info,
+ const struct gmx_gpu_opt_t *gpu_opt)
+{
+ cudaError_t stat;
+ char sbuf[STRLEN];
+ int gpuid;
+
+ assert(gpu_info);
+ assert(result_str);
+
+ if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
+ {
+ sprintf(sbuf, "Trying to initialize an inexistent GPU: "
+ "there are %d %s-selected GPU(s), but #%d was requested.",
+ gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+ gmx_incons(sbuf);
+ }
+
+ gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
+
+ stat = cudaSetDevice(gpuid);
+ strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+
+ if (debug)
+ {
+ fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
+ }
+
+ //Ignoring return value as NVML errors should be treated not critical.
+ if (stat == cudaSuccess)
+ {
+ init_gpu_application_clocks(fplog, gpuid, gpu_info);
+ }
+ return (stat == cudaSuccess);
+}
+
+gmx_bool free_cuda_gpu(
+ int gmx_unused mygpu, char *result_str,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt
+ )
+{
+ cudaError_t stat;
+ gmx_bool reset_gpu_application_clocks_status = true;
+ int gpuid;
+
+ assert(result_str);
+
+ if (debug)
+ {
+ int gpuid;
+ stat = cudaGetDevice(&gpuid);
+ CU_RET_ERR(stat, "cudaGetDevice failed");
+ fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+ }
+
+ gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
+ if (gpuid != -1)
+ {
+ reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
+ }
+
+ stat = cudaDeviceReset();
+ strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+ return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
+}
+
+/*! \brief Returns true if the gpu characterized by the device properties is
+ * supported by the native gpu acceleration.
+ *
+ * \param[in] dev_prop the CUDA device properties of the gpus to test.
+ * \returns true if the GPU properties passed indicate a compatible
+ * GPU, otherwise false.
+ */
+static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
+{
+ return (dev_prop->major >= 2);
+}
+
+/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
+ *
+ * \param[in] stat GPU status.
+ * \returns true if the provided status is egpuCompatible, otherwise false.
+ */
+static bool is_compatible_gpu(int stat)
+{
+ return (stat == egpuCompatible);
+}
+
+/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
+ *
+ * Returns a status value which indicates compatibility or one of the following
+ * errors: incompatibility, insistence, or insanity (=unexpected behavior).
+ * It also returns the respective device's properties in \dev_prop (if applicable).
+ *
+ * \param[in] dev_id the ID of the GPU to check.
+ * \param[out] dev_prop the CUDA device properties of the device checked.
+ * \returns the status of the requested device
+ */
+static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
+{
+ cudaError_t stat;
+ int ndev;
+
+ stat = cudaGetDeviceCount(&ndev);
+ if (stat != cudaSuccess)
+ {
+ return egpuInsane;
+ }
+
+ if (dev_id > ndev - 1)
+ {
+ return egpuNonexistent;
+ }
+
+ /* TODO: currently we do not make a distinction between the type of errors
+ * that can appear during sanity checks. This needs to be improved, e.g if
+ * the dummy test kernel fails to execute with a "device busy message" we
+ * should appropriately report that the device is busy instead of insane.
+ */
+ if (do_sanity_checks(dev_id, dev_prop) == 0)
+ {
+ if (is_gmx_supported_gpu(dev_prop))
+ {
+ return egpuCompatible;
+ }
+ else
+ {
+ return egpuIncompatible;
+ }
+ }
+ else
+ {
+ return egpuInsane;
+ }
+}
+
+
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+{
+ int i, ndev, checkres, retval;
+ cudaError_t stat;
+ cudaDeviceProp prop;
+ gmx_device_info_t *devs;
+
+ assert(gpu_info);
+ assert(err_str);
+
+ gpu_info->n_dev_compatible = 0;
+
+ ndev = 0;
+ devs = NULL;
+
+ stat = cudaGetDeviceCount(&ndev);
+ if (stat != cudaSuccess)
+ {
+ const char *s;
+
+ /* cudaGetDeviceCount failed which means that there is something
+ * wrong with the machine: driver-runtime mismatch, all GPUs being
+ * busy in exclusive mode, or some other condition which should
+ * result in us issuing a warning a falling back to CPUs. */
+ retval = -1;
+ s = cudaGetErrorString(stat);
+ strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
+ }
+ else
+ {
+ snew(devs, ndev);
+ for (i = 0; i < ndev; i++)
+ {
+ checkres = is_gmx_supported_gpu_id(i, &prop);
+
+ devs[i].id = i;
+ devs[i].prop = prop;
+ devs[i].stat = checkres;
+
+ if (checkres == egpuCompatible)
+ {
+ gpu_info->n_dev_compatible++;
+ }
+ }
+ retval = 0;
+ }
+
+ gpu_info->n_dev = ndev;
+ gpu_info->gpu_dev = devs;
+
+ return retval;
+}
+
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+ gmx_gpu_opt_t *gpu_opt)
+{
+ int i, ncompat;
+ int *compat;
+
+ assert(gpu_info);
+ /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
+ assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
+
+ snew(compat, gpu_info->n_dev);
+ ncompat = 0;
+ for (i = 0; i < gpu_info->n_dev; i++)
+ {
+ if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
+ {
+ ncompat++;
+ compat[ncompat - 1] = i;
+ }
+ }
+
+ gpu_opt->n_dev_compatible = ncompat;
+ snew(gpu_opt->dev_compatible, ncompat);
+ memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
+ sfree(compat);
+}
+
+gmx_bool check_selected_gpus(int *checkres,
+ const gmx_gpu_info_t *gpu_info,
+ gmx_gpu_opt_t *gpu_opt)
+{
+ int i, id;
+ bool bAllOk;
+
+ assert(checkres);
+ assert(gpu_info);
+ assert(gpu_opt->n_dev_use >= 0);
+
+ if (gpu_opt->n_dev_use == 0)
+ {
+ return TRUE;
+ }
+
+ assert(gpu_opt->dev_use);
+
+ /* we will assume that all GPUs requested are valid IDs,
+ otherwise we'll bail anyways */
+
+ bAllOk = true;
+ for (i = 0; i < gpu_opt->n_dev_use; i++)
+ {
+ id = gpu_opt->dev_use[i];
+
+ /* devices are stored in increasing order of IDs in gpu_dev */
+ gpu_opt->dev_use[i] = id;
+
+ checkres[i] = (id >= gpu_info->n_dev) ?
+ egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+ bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+ }
+
+ return bAllOk;
+}
+
+void free_gpu_info(const gmx_gpu_info_t *gpu_info)
+{
+ if (gpu_info == NULL)
+ {
+ return;
+ }
+
+ sfree(gpu_info->gpu_dev);
+}
+
+void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
+{
+ assert(s);
+ assert(gpu_info);
+
+ if (index < 0 && index >= gpu_info->n_dev)
+ {
+ return;
+ }
+
+ gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
+
+ bool bGpuExists =
+ dinfo->stat == egpuCompatible ||
+ dinfo->stat == egpuIncompatible;
+
+ if (!bGpuExists)
+ {
+ sprintf(s, "#%d: %s, stat: %s",
+ dinfo->id, "N/A",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+ else
+ {
+ sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
+ dinfo->id, dinfo->prop.name,
+ dinfo->prop.major, dinfo->prop.minor,
+ dinfo->prop.ECCEnabled ? "yes" : " no",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+}
+
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
+{
+ assert(gpu_info);
+ assert(gpu_opt);
+ assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+ return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
+}
+
+int get_current_cuda_gpu_device_id(void)
+{
+ int gpuid;
+ CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
+
+ return gpuid;
+}
+
+size_t sizeof_gpu_dev_info(void)
+{
+ return sizeof(gmx_device_info_t);
+}
+
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free)
+{
+ if (bUseGpuKernels)
+ {
+ *nb_alloc = &pmalloc;
+ *nb_free = &pfree;
+ }
+ else
+ {
+ *nb_alloc = NULL;
+ *nb_free = NULL;
+ }
+}
+
+void startGpuProfiler(void)
+{
+ /* The NVPROF_ID environment variable is set by nvprof and indicates that
+ mdrun is executed in the CUDA profiler.
+ If nvprof was run is with "--profile-from-start off", the profiler will
+ be started here. This way we can avoid tracing the CUDA events from the
+ first part of the run. Starting the profiler again does nothing.
+ */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStart();
+ CU_RET_ERR(stat, "cudaProfilerStart failed");
+ }
+}
+
+void stopGpuProfiler(void)
+{
+ /* Stopping the nvidia here allows us to eliminate the subsequent
+ API calls from the trace, e.g. uninitialization and cleanup. */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStop();
+ CU_RET_ERR(stat, "cudaProfilerStop failed");
+ }
+}
+
+void resetGpuProfiler(void)
+{
+ /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
+ * the profiling here (can't stop it) which will achieve the desired effect if
+ * the run was started with the profiling disabled.
+ *
+ * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
+ * stopGpuProfiler();
+ */
+ if (cudaProfilerRun)
+ {
+ startGpuProfiler();
+ }
+}
--- /dev/null
- md_print_warn(cr, fplog,
- "%d CPUs configured, but only %d of them are online.\n"
- "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n"
- "off to save power, and will turn them back on later when the load increases.\n"
- "However, this will likely mean GROMACS cannot pin threads to those cores. You\n"
- "will likely see much better performance by forcing all cores to be online, and\n"
- "making sure they run at their full clock frequency.", count, countOnline);
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements gmx::HardwareTopology.
+ *
+ * \author Erik Lindahl <erik.lindahl@gmail.com>
+ * \ingroup module_hardware
+ */
+
+#include "gmxpre.h"
+
+#include "hardwaretopology.h"
+
+#include "config.h"
+
+#include <cstdio>
+
+#include <algorithm>
+#include <vector>
+
+#if GMX_HWLOC
+# include <hwloc.h>
+#endif
+
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/gmxomp.h"
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h> // sysconf()
+#endif
+#if GMX_NATIVE_WINDOWS
+# include <windows.h> // GetSystemInfo()
+#endif
+
++#if defined(_M_ARM) || defined(__arm__) || defined(__ARM_ARCH) || defined (__aarch64__)
++//! Constant used to help minimize preprocessed code
++static const bool isArm = true;
++#else
++//! Constant used to help minimize preprocessed code
++static const bool isArm = false;
++#endif
++
+namespace gmx
+{
+
+namespace
+{
+
+/*****************************************************************************
+ * *
+ * Utility functions for extracting hardware topology from CpuInfo object *
+ * *
+ *****************************************************************************/
+
+/*! \brief Initialize machine data from basic information in cpuinfo
+ *
+ * \param machine Machine tree structure where information will be assigned
+ * if the cpuinfo object contains topology information.
+ * \param supportLevel If topology information is available in CpuInfo,
+ * this will be updated to reflect the amount of
+ * information written to the machine structure.
+ */
+void
+parseCpuInfo(HardwareTopology::Machine * machine,
+ HardwareTopology::SupportLevel * supportLevel)
+{
+ CpuInfo cpuInfo(CpuInfo::detect());
+
+ if (!cpuInfo.logicalProcessors().empty())
+ {
+ int nSockets = 0;
+ int nCores = 0;
+ int nHwThreads = 0;
+
+ // Copy the logical processor information from cpuinfo
+ for (auto &l : cpuInfo.logicalProcessors())
+ {
+ machine->logicalProcessors.push_back( { l.socketRankInMachine, l.coreRankInSocket, l.hwThreadRankInCore, -1 } );
+ nSockets = std::max(nSockets, l.socketRankInMachine);
+ nCores = std::max(nCores, l.coreRankInSocket);
+ nHwThreads = std::max(nHwThreads, l.hwThreadRankInCore);
+ }
+
+ // Fill info form sockets/cores/hwthreads
+ int socketId = 0;
+ int coreId = 0;
+ int hwThreadId = 0;
+
+ machine->sockets.resize(nSockets + 1);
+ for (auto &s : machine->sockets)
+ {
+ s.id = socketId++;
+ s.cores.resize(nCores + 1);
+ for (auto &c : s.cores)
+ {
+ c.id = coreId++;
+ c.numaNodeId = -1; // No numa information
+ c.hwThreads.resize(nHwThreads + 1);
+ for (auto &t : c.hwThreads)
+ {
+ t.id = hwThreadId++;
+ t.logicalProcessorId = -1; // set as unassigned for now
+ }
+ }
+ }
+
+ // Fill the logical processor id in the right place
+ for (std::size_t i = 0; i < machine->logicalProcessors.size(); i++)
+ {
+ const HardwareTopology::LogicalProcessor &l = machine->logicalProcessors[i];
+ machine->sockets[l.socketRankInMachine].cores[l.coreRankInSocket].hwThreads[l.hwThreadRankInCore].logicalProcessorId = static_cast<int>(i);
+ }
+ machine->logicalProcessorCount = machine->logicalProcessors.size();
+ *supportLevel = HardwareTopology::SupportLevel::Basic;
+ }
+ else
+ {
+ *supportLevel = HardwareTopology::SupportLevel::None;
+ }
+}
+
+#if GMX_HWLOC
+
+#if HWLOC_API_VERSION < 0x00010b00
+# define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET
+# define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
+#endif
+
+/*****************************************************************************
+ * *
+ * Utility functions for extracting hardware topology from hwloc library *
+ * *
+ *****************************************************************************/
+
+/*! \brief Return vector of all descendants of a given type in hwloc topology
+ *
+ * \param obj Non-null hwloc object.
+ * \param type hwloc object type to find. The routine will only search
+ * on levels below obj.
+ *
+ * \return vector containing all the objects of given type that are
+ * descendants of the provided object. If no objects of this type
+ * were found, the vector will be empty.
+ */
+const std::vector<hwloc_obj_t>
+getHwLocDescendantsByType(const hwloc_obj_t obj, const hwloc_obj_type_t type)
+{
+ GMX_RELEASE_ASSERT(obj, "NULL hwloc object provided to getHwLocDescendantsByType()");
+
+ std::vector<hwloc_obj_t> v;
+
+ // Go through children; if this object has no children obj->arity is 0,
+ // and we'll return an empty vector.
+ for (std::size_t i = 0; i < obj->arity; i++)
+ {
+ // If the child is the type we're looking for, add it directly.
+ // Otherwise call this routine recursively for each child.
+ if (obj->children[i]->type == type)
+ {
+ v.push_back(obj->children[i]);
+ }
+ else
+ {
+ std::vector<hwloc_obj_t> v2 = getHwLocDescendantsByType(obj->children[i], type);
+ v.insert(v.end(), v2.begin(), v2.end());
+ }
+ }
+ return v;
+}
+
+/*! \brief Read information about sockets, cores and threads from hwloc topology
+ *
+ * \param topo hwloc topology handle that has been initialized and loaded
+ * \param machine Pointer to the machine structure in the HardwareTopology
+ * class, where the tree of sockets/cores/threads will be written.
+ *
+ * \return If all the data is found the return value is 0, otherwise non-zero.
+ */
+int
+parseHwLocSocketsCoresThreads(const hwloc_topology_t topo,
+ HardwareTopology::Machine * machine)
+{
+ const hwloc_obj_t root = hwloc_get_root_obj(topo);
+ std::vector<hwloc_obj_t> hwlocSockets = getHwLocDescendantsByType(root, HWLOC_OBJ_PACKAGE);
+
+ machine->logicalProcessorCount = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
+ machine->logicalProcessors.resize(machine->logicalProcessorCount);
+ machine->sockets.resize(hwlocSockets.size());
+
+ bool topologyOk = !hwlocSockets.empty(); // Fail if we have no sockets in machine
+
+ for (std::size_t i = 0; i < hwlocSockets.size() && topologyOk; i++)
+ {
+ // Assign information about this socket
+ machine->sockets[i].id = hwlocSockets[i]->logical_index;
+
+ // Get children (cores)
+ std::vector<hwloc_obj_t> hwlocCores = getHwLocDescendantsByType(hwlocSockets[i], HWLOC_OBJ_CORE);
+ machine->sockets[i].cores.resize(hwlocCores.size());
+
+ topologyOk = topologyOk && !hwlocCores.empty(); // Fail if we have no cores in socket
+
+ // Loop over child cores
+ for (std::size_t j = 0; j < hwlocCores.size() && topologyOk; j++)
+ {
+ // Assign information about this core
+ machine->sockets[i].cores[j].id = hwlocCores[j]->logical_index;
+ machine->sockets[i].cores[j].numaNodeId = -1;
+
+ // Get children (hwthreads)
+ std::vector<hwloc_obj_t> hwlocPUs = getHwLocDescendantsByType(hwlocCores[j], HWLOC_OBJ_PU);
+ machine->sockets[i].cores[j].hwThreads.resize(hwlocPUs.size());
+
+ topologyOk = topologyOk && !hwlocPUs.empty(); // Fail if we have no hwthreads in core
+
+ // Loop over child hwthreads
+ for (std::size_t k = 0; k < hwlocPUs.size() && topologyOk; k++)
+ {
+ // Assign information about this hwthread
+ std::size_t logicalProcessorId = hwlocPUs[k]->os_index;
+ machine->sockets[i].cores[j].hwThreads[k].id = hwlocPUs[k]->logical_index;
+ machine->sockets[i].cores[j].hwThreads[k].logicalProcessorId = logicalProcessorId;
+
+ if (logicalProcessorId < machine->logicalProcessors.size())
+ {
+ // Cross-assign data for this hwthread to the logicalprocess vector
+ machine->logicalProcessors[logicalProcessorId].socketRankInMachine = static_cast<int>(i);
+ machine->logicalProcessors[logicalProcessorId].coreRankInSocket = static_cast<int>(j);
+ machine->logicalProcessors[logicalProcessorId].hwThreadRankInCore = static_cast<int>(k);
+ machine->logicalProcessors[logicalProcessorId].numaNodeId = -1;
+ }
+ else
+ {
+ topologyOk = false;
+ }
+ }
+ }
+ }
+
+ if (topologyOk)
+ {
+ return 0;
+ }
+ else
+ {
+ machine->logicalProcessors.clear();
+ machine->sockets.clear();
+ return -1;
+ }
+}
+
+/*! \brief Read cache information from hwloc topology
+ *
+ * \param topo hwloc topology handle that has been initialized and loaded
+ * \param machine Pointer to the machine structure in the HardwareTopology
+ * class, where cache data will be filled.
+ *
+ * \return If any cache data is found the return value is 0, otherwise non-zero.
+ */
+int
+parseHwLocCache(const hwloc_topology_t topo,
+ HardwareTopology::Machine * machine)
+{
+ // Parse caches up to L5
+ for (int cachelevel : { 1, 2, 3, 4, 5})
+ {
+ int depth = hwloc_get_cache_type_depth(topo, cachelevel, HWLOC_OBJ_CACHE_DATA);
+
+ if (depth >= 0)
+ {
+ hwloc_obj_t cache = hwloc_get_next_obj_by_depth(topo, depth, NULL);
+ if (cache != NULL)
+ {
+ std::vector<hwloc_obj_t> hwThreads = getHwLocDescendantsByType(cache, HWLOC_OBJ_PU);
+
+ machine->caches.push_back( {
+ static_cast<int>(cache->attr->cache.depth),
+ static_cast<std::size_t>(cache->attr->cache.size),
+ static_cast<int>(cache->attr->cache.linesize),
+ static_cast<int>(cache->attr->cache.associativity),
+ std::max(static_cast<int>(hwThreads.size()), 1)
+ } );
+ }
+ }
+ }
+ return machine->caches.empty();
+}
+
+
+/*! \brief Read numa information from hwloc topology
+ *
+ * \param topo hwloc topology handle that has been initialized and loaded
+ * \param machine Pointer to the machine structure in the HardwareTopology
+ * class, where numa information will be filled.
+ *
+ * Hwloc should virtually always be able to detect numa information, but if
+ * there is only a single numa node in the system it is not reported at all.
+ * In this case we create a single numa node covering all cores.
+ *
+ * This function uses the basic socket/core/thread information detected by
+ * parseHwLocSocketsCoresThreads(), which means that routine must have
+ * completed successfully before calling this one. If this is not the case,
+ * you will get an error return code.
+ *
+ * \return If the data found makes sense (either in the numa node or the
+ * entire machine) the return value is 0, otherwise non-zero.
+ */
+int
+parseHwLocNuma(const hwloc_topology_t topo,
+ HardwareTopology::Machine * machine)
+{
+ const hwloc_obj_t root = hwloc_get_root_obj(topo);
+ std::vector<hwloc_obj_t> hwlocNumaNodes = getHwLocDescendantsByType(root, HWLOC_OBJ_NUMANODE);
+ bool topologyOk = true;
+
+ if (!hwlocNumaNodes.empty())
+ {
+ machine->numa.nodes.resize(hwlocNumaNodes.size());
+
+ for (std::size_t i = 0; i < hwlocNumaNodes.size(); i++)
+ {
+ machine->numa.nodes[i].id = hwlocNumaNodes[i]->logical_index;
+ machine->numa.nodes[i].memory = hwlocNumaNodes[i]->memory.total_memory;
+ machine->numa.nodes[i].logicalProcessorId.clear();
+
+ // Get list of PUs in this numa node
+ std::vector<hwloc_obj_t> hwlocPUs = getHwLocDescendantsByType(hwlocNumaNodes[i], HWLOC_OBJ_PU);
+
+ for (auto &p : hwlocPUs)
+ {
+ machine->numa.nodes[i].logicalProcessorId.push_back(p->os_index);
+
+ GMX_RELEASE_ASSERT(p->os_index < machine->logicalProcessors.size(), "OS index of PU in hwloc larger than processor count");
+
+ machine->logicalProcessors[p->os_index].numaNodeId = static_cast<int>(i);
+ std::size_t s = machine->logicalProcessors[p->os_index].socketRankInMachine;
+ std::size_t c = machine->logicalProcessors[p->os_index].coreRankInSocket;
+
+ GMX_RELEASE_ASSERT(s < machine->sockets.size(), "Socket index in logicalProcessors larger than socket count");
+ GMX_RELEASE_ASSERT(c < machine->sockets[s].cores.size(), "Core index in logicalProcessors larger than core count");
+ // Set numaNodeId in core too
+ machine->sockets[s].cores[c].numaNodeId = i;
+ }
+ }
+
+ int depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE);
+ const struct hwloc_distances_s * dist = hwloc_get_whole_distance_matrix_by_depth(topo, depth);
+ if (dist != NULL && dist->nbobjs == hwlocNumaNodes.size())
+ {
+ machine->numa.baseLatency = dist->latency_base;
+ machine->numa.maxRelativeLatency = dist->latency_max;
+ machine->numa.relativeLatency.resize(dist->nbobjs);
+ for (std::size_t i = 0; i < dist->nbobjs; i++)
+ {
+ machine->numa.relativeLatency[i].resize(dist->nbobjs);
+ for (std::size_t j = 0; j < dist->nbobjs; j++)
+ {
+ machine->numa.relativeLatency[i][j] = dist->latency[i*dist->nbobjs+j];
+ }
+ }
+ }
+ else
+ {
+ topologyOk = false;
+ }
+ }
+ else
+ {
+ // No numa nodes found. Use the entire machine as a numa node.
+ const hwloc_obj_t hwlocMachine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, NULL);
+
+ if (hwlocMachine != NULL)
+ {
+ machine->numa.nodes.resize(1);
+ machine->numa.nodes[0].id = 0;
+ machine->numa.nodes[0].memory = hwlocMachine->memory.total_memory;
+ machine->numa.baseLatency = 10;
+ machine->numa.maxRelativeLatency = 1;
+ machine->numa.relativeLatency = { { 1.0 } };
+
+ for (int i = 0; i < machine->logicalProcessorCount; i++)
+ {
+ machine->numa.nodes[0].logicalProcessorId.push_back(i);
+ }
+ for (auto &l : machine->logicalProcessors)
+ {
+ l.numaNodeId = 0;
+ }
+ for (auto &s : machine->sockets)
+ {
+ for (auto &c : s.cores)
+ {
+ c.numaNodeId = 0;
+ }
+ }
+ }
+ else
+ {
+ topologyOk = false;
+ }
+ }
+
+ if (topologyOk)
+ {
+ return 0;
+ }
+ else
+ {
+ machine->numa.nodes.clear();
+ return -1;
+ }
+
+}
+
+/*! \brief Read PCI device information from hwloc topology
+ *
+ * \param topo hwloc topology handle that has been initialized and loaded
+ * \param machine Pointer to the machine structure in the HardwareTopology
+ * class, where PCI device information will be filled.
+ * *
+ * \return If any devices were found the return value is 0, otherwise non-zero.
+ */
+int
+parseHwLocDevices(const hwloc_topology_t topo,
+ HardwareTopology::Machine * machine)
+{
+ const hwloc_obj_t root = hwloc_get_root_obj(topo);
+ std::vector<hwloc_obj_t> pcidevs = getHwLocDescendantsByType(root, HWLOC_OBJ_PCI_DEVICE);
+
+ for (auto &p : pcidevs)
+ {
+ const hwloc_obj_t ancestor = hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_NUMANODE, p);
+ int numaId;
+ if (ancestor != NULL)
+ {
+ numaId = ancestor->logical_index;
+ }
+ else
+ {
+ // If we only have a single numa node we belong to it, otherwise set it to -1 (unknown)
+ numaId = (machine->numa.nodes.size() == 1) ? 0 : -1;
+ }
+
+ GMX_RELEASE_ASSERT(p->attr, "Attributes should not be NULL for hwloc PCI object");
+
+ machine->devices.push_back( {
+ p->attr->pcidev.vendor_id,
+ p->attr->pcidev.device_id,
+ p->attr->pcidev.class_id,
+ p->attr->pcidev.domain,
+ p->attr->pcidev.bus,
+ p->attr->pcidev.dev,
+ p->attr->pcidev.func,
+ numaId
+ } );
+ }
+ return pcidevs.empty();
+}
+
+void
+parseHwLoc(HardwareTopology::Machine * machine,
+ HardwareTopology::SupportLevel * supportLevel,
+ bool * isThisSystem)
+{
+ hwloc_topology_t topo;
+
+ // Initialize a hwloc object, set flags to request IO device information too,
+ // try to load the topology, and get the root object. If either step fails,
+ // return that we do not have any support at all from hwloc.
+ if (hwloc_topology_init(&topo) != 0)
+ {
+ hwloc_topology_destroy(topo);
+ return; // SupportLevel::None.
+ }
+
+ hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
+
+ if (hwloc_topology_load(topo) != 0 || hwloc_get_root_obj(topo) == NULL)
+ {
+ hwloc_topology_destroy(topo);
+ return; // SupportLevel::None.
+ }
+
+ // If we get here, we can get a valid root object for the topology
+ *isThisSystem = hwloc_topology_is_thissystem(topo);
+
+ // Parse basic information about sockets, cores, and hardware threads
+ if (parseHwLocSocketsCoresThreads(topo, machine) == 0)
+ {
+ *supportLevel = HardwareTopology::SupportLevel::Basic;
+ }
+ else
+ {
+ hwloc_topology_destroy(topo);
+ return; // SupportLevel::None.
+ }
+
+ // Get information about cache and numa nodes
+ if (parseHwLocCache(topo, machine) == 0 && parseHwLocNuma(topo, machine) == 0)
+ {
+ *supportLevel = HardwareTopology::SupportLevel::Full;
+ }
+ else
+ {
+ hwloc_topology_destroy(topo);
+ return; // SupportLevel::Basic.
+ }
+
+ // PCI devices
+ if (parseHwLocDevices(topo, machine) == 0)
+ {
+ *supportLevel = HardwareTopology::SupportLevel::FullWithDevices;
+ }
+
+ hwloc_topology_destroy(topo);
+ return; // SupportLevel::Full or SupportLevel::FullWithDevices.
+}
+
+#endif
+
+/*! \brief Try to detect the number of logical processors.
+ *
+ * \return The number of hardware processing units, or 0 if it fails.
+ */
+int
+detectLogicalProcessorCount(FILE *fplog, const t_commrec *cr)
+{
+ int count = 0;
+
+ {
+#if GMX_NATIVE_WINDOWS
+ // Windows
+ SYSTEM_INFO sysinfo;
+ GetSystemInfo( &sysinfo );
+ count = sysinfo.dwNumberOfProcessors;
+#elif defined HAVE_SYSCONF
+ // We are probably on Unix. Check if we have the argument to use before executing any calls
+# if defined(_SC_NPROCESSORS_CONF)
+ count = sysconf(_SC_NPROCESSORS_CONF);
+# if defined(_SC_NPROCESSORS_ONLN)
+ /* On e.g. Arm, the Linux kernel can use advanced power saving features where
+ * processors are brought online/offline dynamically. This will cause
+ * _SC_NPROCESSORS_ONLN to report 1 at the beginning of the run. For this
+ * reason we now warn if this mismatches with the detected core count. */
+ int countOnline = sysconf(_SC_NPROCESSORS_ONLN);
+ if (count != countOnline)
+ {
++ /* We assume that this scenario means that the kernel has
++ disabled threads or cores, and that the only safe course is
++ to assume that _SC_NPROCESSORS_ONLN should be used. Even
++ this may not be valid if running in a containerized
++ environment, such system calls may read from
++ /sys/devices/system/cpu and report what the OS sees, rather
++ than what the container cgroup is supposed to set up as
++ limits. But we're not sure right now whether there's any
++ (standard-ish) way to handle that.
++
++ On ARM, the kernel may have powered down the cores,
++ which we'll warn the user about now. On x86, this
++ means HT is disabled by the kernel, not in the
++ BIOS. We're not sure what it means on other
++ architectures, or even if it is possible, because
++ sysconf is rather non-standardized. */
++ if (isArm)
++ {
++ md_print_warn(cr, fplog,
++ "%d CPUs configured, but only %d of them are online.\n"
++ "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n"
++ "off to save power, and will turn them back on later when the load increases.\n"
++ "However, this will likely mean GROMACS cannot pin threads to those cores. You\n"
++ "will likely see much better performance by forcing all cores to be online, and\n"
++ "making sure they run at their full clock frequency.", count, countOnline);
++ }
++ else
++ {
++ md_print_warn(cr, fplog,
++ "Note: %d CPUs configured, but only %d of them are online, so GROMACS will use the latter.",
++ count, countOnline);
++ // We use the online count to avoid (potential) oversubscription.
++ count = countOnline;
++ }
+ }
+# endif
+# elif defined(_SC_NPROC_CONF)
+ count = sysconf(_SC_NPROC_CONF);
+# elif defined(_SC_NPROCESSORS_ONLN)
+ count = sysconf(_SC_NPROCESSORS_ONLN);
+# elif defined(_SC_NPROC_ONLN)
+ count = sysconf(_SC_NPROC_ONLN);
+# else
+# warning "No valid sysconf argument value found. Executables will not be able to determine the number of logical cores: mdrun will use 1 thread by default!"
+# endif // End of check for sysconf argument values
+
+#else
+ count = 0; // Neither windows nor Unix.
+#endif
+ }
+#if GMX_OPENMP
+ int countFromOpenmp = gmx_omp_get_num_procs();
+ if (count != countFromOpenmp)
+ {
+ md_print_warn(cr, fplog,
+ "Number of logical cores detected (%d) does not match the number reported by OpenMP (%d).\n"
+ "Consider setting the launch configuration manually!",
+ count, countFromOpenmp);
+ }
+#endif
+
+ GMX_UNUSED_VALUE(cr);
+ GMX_UNUSED_VALUE(fplog);
+ return count;
+}
+
+} // namespace anonymous
+
+// static
+HardwareTopology HardwareTopology::detect(FILE *fplog, const t_commrec *cr)
+{
+ HardwareTopology result;
+
+ // Default values for machine and numa stuff
+ result.machine_.logicalProcessorCount = 0;
+ result.machine_.numa.baseLatency = 0.0;
+ result.machine_.numa.maxRelativeLatency = 0.0;
+ result.supportLevel_ = SupportLevel::None;
+ result.isThisSystem_ = true;
+
+#if GMX_HWLOC
+ parseHwLoc(&result.machine_, &result.supportLevel_, &result.isThisSystem_);
+#endif
+
+ // If something went wrong in hwloc (or if it was not present) we might
+ // have more information in cpuInfo
+ if (result.supportLevel_ < SupportLevel::Basic)
+ {
+ // There might be topology information in cpuInfo
+ parseCpuInfo(&result.machine_, &result.supportLevel_);
+ }
+ // If we did not manage to get anything from either hwloc or cpuInfo, find the cpu count at least
+ if (result.supportLevel_ == SupportLevel::None)
+ {
+ // No topology information; try to detect the number of logical processors at least
+ result.machine_.logicalProcessorCount = detectLogicalProcessorCount(fplog, cr);
+ if (result.machine_.logicalProcessorCount > 0)
+ {
+ result.supportLevel_ = SupportLevel::LogicalProcessorCount;
+ }
+ }
+ return result;
+}
+
+
+HardwareTopology::HardwareTopology()
+ : supportLevel_(SupportLevel::None)
+{
+}
+
+int HardwareTopology::numberOfCores() const
+{
+ if (supportLevel() >= SupportLevel::Basic)
+ {
+ // We assume all sockets have the same number of cores as socket 0.
+ // Since topology information is present, we can assume there is at least one socket.
+ return machine().sockets.size() * machine().sockets[0].cores.size();
+ }
+ else if (supportLevel() >= SupportLevel::LogicalProcessorCount)
+ {
+ return machine().logicalProcessorCount;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+} // namespace gmx