From: Mark Abraham Date: Wed, 29 Jun 2016 13:30:40 +0000 (+0200) Subject: Merge branch release-5-1 into release-2016 X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ed8d537d336d90081b62d78a8ecd63f67cfb7a17;p=alexxy%2Fgromacs.git Merge branch release-5-1 into release-2016 The changes to checks in release-5-1 to check_nthreads_hw_avail() are all incorporated into detectLogicalProcessorCount(). Change-Id: Ifbee24dc40630988a14026b9f45cfaf6e92390ba --- ed8d537d336d90081b62d78a8ecd63f67cfb7a17 diff --cc src/gromacs/gpu_utils/cudautils.cuh index 495b1a4d29,0000000000..5daa040d28 mode 100644,000000..100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@@ -1,169 -1,0 +1,173 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_CUDAUTILS_CUH +#define GMX_GPU_UTILS_CUDAUTILS_CUH + +#include "config.h" + +#include +#if HAVE_NVML +#include +#endif /* HAVE_NVML */ + +#include "gromacs/utility/fatalerror.h" + +/* TODO error checking needs to be rewritten. We have 2 types of error checks needed + based on where they occur in the code: + - non performance-critical: these errors are unsafe to be ignored and must be + _always_ checked for, e.g. initializations + - performance critical: handling errors might hurt performance so care need to be taken + when/if we should check for them at all, e.g. in cu_upload_X. However, we should be + able to turn the check for these errors on! + + Probably we'll need two sets of the macros below... + + */ +#define CHECK_CUDA_ERRORS + +#ifdef CHECK_CUDA_ERRORS + +/*! Check for CUDA error on the return status of a CUDA RT API call. */ +#define CU_RET_ERR(status, msg) \ + do { \ + if (status != cudaSuccess) \ + { \ + gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \ + } \ + } while (0) + +/*! 
Check for any previously occurred uncaught CUDA error. */
+#define CU_CHECK_PREV_ERR() \
+ do { \
+ cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
+ if (_CU_CHECK_PREV_ERR_status != cudaSuccess) { \
+ gmx_warning("Just caught a previously occurred CUDA error (%s), will try to continue.", cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR(msg) \
+ do { \
+ cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
+ if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR_SYNC(msg) \
+ do { \
+ cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
+ if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+#else /* CHECK_CUDA_ERRORS */
+
+#define CU_RET_ERR(status, msg) do { } while (0)
+#define CU_CHECK_PREV_ERR() do { } while (0)
+#define CU_LAUNCH_ERR(msg) do { } while (0)
+#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
+#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+
+#endif /* CHECK_CUDA_ERRORS */
+
+/*! \brief CUDA device information.
+ *
+ * The CUDA device information is queried and set at detection and contains
+ * both information about the device/hardware returned by the runtime as well
+ * as additional data like support status.
+ */
+struct gmx_device_info_t
+{
- int id; /* id of the CUDA device */
- cudaDeviceProp prop; /* CUDA device properties */
- int stat; /* result of the device check */
- gmx_bool nvml_initialized; /* If NVML was initialized */
- gmx_bool nvml_ap_clocks_changed; /* If application clocks have been changed */
++ int id; /* id of the CUDA device */
++ cudaDeviceProp prop; /* CUDA device properties */
++ int stat; /* result of the device check */
++ gmx_bool nvml_initialized; /* If NVML was initialized */
++ unsigned int nvml_orig_app_sm_clock; /* The original SM clock before we changed it */
++ unsigned int nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
++ gmx_bool nvml_app_clocks_changed; /* If application clocks have been changed */
++ unsigned int nvml_set_app_sm_clock; /* The SM clock we set */
++ unsigned int nvml_set_app_mem_clock; /* The memory clock we set */
+#if HAVE_NVML
- nvmlDevice_t nvml_device_id; /* NVML device id */
- nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
- #endif /* HAVE_NVML */
++ nvmlDevice_t nvml_device_id; /* NVML device id */
++ nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
++#endif /* HAVE_NVML */
+};
+
+
+/*! Launches synchronous device to host memory copy in stream 0. */
+int cu_copy_D2H(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous device to host memory copy in stream s. */
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates host memory and launches synchronous device to host memory copy. */
+int cu_copy_D2H_alloc(void ** /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+
+/*! Launches synchronous host to device memory copy.
*/ +int cu_copy_H2D(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); + +/*! Launches asynchronous host to device memory copy in stream s. */ +int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/); + +/*! Allocates device memory and launches synchronous host to device memory copy. */ +int cu_copy_H2D_alloc(void ** /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); + +/*! Frees device memory and resets the size and allocation size to -1. */ +void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL); + +/*! Reallocates the device memory and copies data from the host. */ +void cu_realloc_buffered(void **d_dest, void *h_src, + size_t type_size, + int *curr_size, int *curr_alloc_size, + int req_size, + cudaStream_t s, + bool bAsync); + +/*! Waits for event e to complete, */ +int cu_wait_event(cudaEvent_t /*e*/); + +/*! Calculates and returns the time elapsed between event start and end. */ +float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/); + +/*! Waits for event end to complete and calculates the time between start and end. */ +int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/); + +#endif diff --cc src/gromacs/gpu_utils/gpu_utils.cu index f08d55443e,0000000000..e8b78249e9 mode 100644,000000..100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@@ -1,832 -1,0 +1,867 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \file + * \brief Define functions for detection and initialization for CUDA devices. 
+ * + * \author Szilard Pall + */ + +#include "gmxpre.h" + +#include "gpu_utils.h" + +#include "config.h" + +#include +#include +#include + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/pmalloc_cuda.h" +#include "gromacs/hardware/gpu_hw_info.h" +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/smalloc.h" + +#if HAVE_NVML +#include +#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6) +#else /* HAVE_NVML */ +#define HAVE_NVML_APPLICATION_CLOCKS 0 +#endif /* HAVE_NVML */ + +#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS +/*! Check for NVML error on the return status of a NVML API call. */ +# define HANDLE_NVML_RET_ERR(status, msg) \ + do { \ + if (status != NVML_SUCCESS) \ + { \ + gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \ + } \ + } while (0) +#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */ +# define HANDLE_NVML_RET_ERR(status, msg) do { } while (0) +#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */ + +#if HAVE_NVML_APPLICATION_CLOCKS +static const gmx_bool bCompiledWithApplicationClockSupport = true; +#else +static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false; +#endif + +/*! \internal \brief + * Max number of devices supported by CUDA (for consistency checking). + * + * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. + */ +static int cuda_max_device_count = 32; + +static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL)); + +/** Dummy kernel used for sanity checking. */ +__global__ void k_dummy_test() +{ +} + + +/*! + * \brief Runs GPU sanity checks. + * + * Runs a series of checks to determine that the given GPU and underlying CUDA + * driver/runtime functions properly. + * Returns properties of a device with given ID or the one that has + * already been initialized earlier in the case if of \dev_id == -1. + * + * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized + * \param[out] dev_prop pointer to the structure in which the device properties will be returned + * \returns 0 if the device looks OK + * + * TODO: introduce errors codes and handle errors more smoothly. 
+ */ +static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) +{ + cudaError_t cu_err; + int dev_count, id; + + cu_err = cudaGetDeviceCount(&dev_count); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + + /* no CUDA compatible device at all */ + if (dev_count == 0) + { + return -1; + } + + /* things might go horribly wrong if cudart is not compatible with the driver */ + if (dev_count < 0 || dev_count > cuda_max_device_count) + { + return -1; + } + + if (dev_id == -1) /* device already selected let's not destroy the context */ + { + cu_err = cudaGetDevice(&id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + } + else + { + id = dev_id; + if (id > dev_count - 1) /* pfff there's no such device */ + { + fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n", + dev_id, dev_count); + return -1; + } + } + + memset(dev_prop, 0, sizeof(cudaDeviceProp)); + cu_err = cudaGetDeviceProperties(dev_prop, id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + + /* both major & minor is 9999 if no CUDA capable devices are present */ + if (dev_prop->major == 9999 && dev_prop->minor == 9999) + { + return -1; + } + /* we don't care about emulation mode */ + if (dev_prop->major == 0) + { + return -1; + } + + if (id != -1) + { + cu_err = cudaSetDevice(id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while switching to device #%d: %s\n", + cu_err, id, cudaGetErrorString(cu_err)); + return -1; + } + } + + /* try to execute a dummy kernel */ + k_dummy_test<<< 1, 512>>> (); + if (cudaThreadSynchronize() != cudaSuccess) + { + return -1; + } + + /* destroy context if we created one */ + if (id != -1) + { + cu_err = cudaDeviceReset(); + CU_RET_ERR(cu_err, "cudaDeviceReset failed"); + } + + return 0; +} + +#if HAVE_NVML +/* TODO: We should actually be using md_print_warn in md_logging.c, + * but we can't include mpi.h in CUDA code. + */ +static void md_print_info(FILE *fplog, + const char *fmt, ...) +{ + va_list ap; + + if (fplog != NULL) + { + /* We should only print to stderr on the master node, + * in most cases fplog is only set on the master node, so this works. + */ + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + va_start(ap, fmt); + vfprintf(fplog, fmt, ap); + va_end(ap); + } +} +#endif /*HAVE_NVML*/ + +/* TODO: We should actually be using md_print_warn in md_logging.c, + * but we can't include mpi.h in CUDA code. + * This is replicated from nbnxn_cuda_data_mgmt.cu. + */ +static void md_print_warn(FILE *fplog, + const char *fmt, ...) +{ + va_list ap; + + if (fplog != NULL) + { + /* We should only print to stderr on the master node, + * in most cases fplog is only set on the master node, so this works. + */ + va_start(ap, fmt); + fprintf(stderr, "\n"); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + + va_start(ap, fmt); + fprintf(fplog, "\n"); + vfprintf(fplog, fmt, ap); + fprintf(fplog, "\n"); + va_end(ap); + } +} + +#if HAVE_NVML_APPLICATION_CLOCKS +/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev. + * + * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by + * matching PCI-E information from \cuda_dev with the available NVML devices. 
+ *
+ * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
+ * \returns true if \cuda_dev could be enriched with matching NVML device information.
+ */
+static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
+{
+ nvmlDevice_t nvml_device_id;
+ unsigned int nvml_device_count = 0;
+ nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
+ cuda_dev->nvml_initialized = false;
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
+ for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
+ {
+ nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+
+ nvmlPciInfo_t nvml_pci_info;
+ nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+ if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
+ {
+ cuda_dev->nvml_initialized = true;
+ cuda_dev->nvml_device_id = nvml_device_id;
+ break;
+ }
+ }
+ return cuda_dev->nvml_initialized;
+}
++
++/*! \brief Reads and returns the application clocks for the device.
++ *
++ * \param[in] cuda_dev The GPU device
++ * \param[out] app_sm_clock The current application SM clock
++ * \param[out] app_mem_clock The current application memory clock
++ * \returns true if application clocks are supported
++ */
++static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
++ unsigned int *app_sm_clock,
++ unsigned int *app_mem_clock)
++{
++ nvmlReturn_t nvml_stat;
++
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
++ if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++ {
++ return false;
++ }
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++
++ return true;
++}
+#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+
+/*! \brief Tries to set application clocks for the GPU with the given index.
+ *
+ * The variable \gpuid is the index of the GPU in the gpu_info.gpu_dev array
+ * to handle the application clocks for. Application clocks are set to the
+ * max supported value to increase performance if application clock permissions
+ * allow this. For future GPU architectures a more sophisticated scheme might be
+ * required.
+ *
+ * \param[out] fplog log file to write to
+ * \param[in] gpuid index of the GPU to set application clocks for
+ * \param[in] gpu_info GPU info of all detected devices in the system.
+ * \returns true if no error occurs during application clocks handling.
+ */ +static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info) +{ + const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop; + int cuda_version_number = prop->major * 10 + prop->minor; + gmx_bool bGpuCanUseApplicationClocks = + ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) || + (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 )); + if (!bGpuCanUseApplicationClocks) + { + return true; + } +#if !HAVE_NVML + int cuda_driver = 0; + int cuda_runtime = 0; + cudaDriverGetVersion(&cuda_driver); + cudaRuntimeGetVersion(&cuda_runtime); + md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n" + " application clocks of the detected %s GPU to improve performance.\n" + " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n", + prop->name); + return true; +#else + if (!bCompiledWithApplicationClockSupport) + { + int cuda_driver = 0; + int cuda_runtime = 0; + cudaDriverGetVersion(&cuda_driver); + cudaRuntimeGetVersion(&cuda_runtime); + md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n" + " managing application clocks of the detected %s GPU to improve performance.\n" + " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n", + prop->name ); + return true; + } + + /* We've compiled with NVML application clocks support, and have a GPU that can use it */ + nvmlReturn_t nvml_stat = NVML_SUCCESS; + char *env; + //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks + // this variable can be later used to give a user more fine grained control. + env = getenv("GMX_GPU_APPLICATION_CLOCKS"); + if (env != NULL && ( strcmp( env, "0") == 0 || + gmx_strcasecmp( env, "OFF") == 0 || + gmx_strcasecmp( env, "DISABLE") == 0 )) + { + return true; + } + nvml_stat = nvmlInit(); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." 
); + if (nvml_stat != NVML_SUCCESS) + { + return false; + } - if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid]))) ++ ++ gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]); ++ ++ if (!addNVMLDeviceId(cuda_dev)) + { + return false; + } + //get current application clocks setting - unsigned int app_sm_clock = 0; - unsigned int app_mem_clock = 0; - nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock ); - if (NVML_ERROR_NOT_SUPPORTED == nvml_stat) ++ if (!getApplicationClocks(cuda_dev, ++ &cuda_dev->nvml_orig_app_sm_clock, ++ &cuda_dev->nvml_orig_app_mem_clock)) + { + return false; + } - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); - nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock ); - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); + //get max application clocks + unsigned int max_sm_clock = 0; + unsigned int max_mem_clock = 0; - nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock ); ++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" ); - nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock ); ++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" ); + - gpu_info->gpu_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED; - gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false; ++ cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED; ++ cuda_dev->nvml_app_clocks_changed = false; + - nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) ); ++ nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted)); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" ); + + /* Note: Distinguishing between different types of GPUs here might be necessary in the future, + e.g. if max application clocks should not be used for certain GPUs. 
*/ - if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED) ++ if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED) + { - md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock); - nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock ); ++ md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock); ++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); - gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true; ++ cuda_dev->nvml_app_clocks_changed = true; ++ cuda_dev->nvml_set_app_sm_clock = max_sm_clock; ++ cuda_dev->nvml_set_app_mem_clock = max_mem_clock; + } - else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock) ++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock) + { - md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock); ++ md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock); + } - else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock) ++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock) + { - //TODO: This should probably be integrated into the GPU Properties table. - md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock); ++ md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock); + } + else + { + md_print_warn( fplog, "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat)); + } + return (nvml_stat == NVML_SUCCESS); +#endif /* HAVE_NVML */ +} + +/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev. + * + * \param[in] gpu_dev CUDA device information + */ +static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev) +{ +#if !HAVE_NVML_APPLICATION_CLOCKS + GMX_UNUSED_VALUE(cuda_dev); + return true; +#else /* HAVE_NVML_APPLICATION_CLOCKS */ + nvmlReturn_t nvml_stat = NVML_SUCCESS; + if (cuda_dev && + cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED && - cuda_dev->nvml_ap_clocks_changed) ++ cuda_dev->nvml_app_clocks_changed) + { - nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id ); - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" ); ++ /* Check if the clocks are still what we set them to. 
++ * If so, set them back to the state we originally found them in. ++ * If not, don't touch them, because something else set them later. ++ */ ++ unsigned int app_sm_clock, app_mem_clock; ++ getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock); ++ if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock && ++ app_mem_clock == cuda_dev->nvml_set_app_mem_clock) ++ { ++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock); ++ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); ++ } + } + nvml_stat = nvmlShutdown(); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" ); + return (nvml_stat == NVML_SUCCESS); +#endif /* HAVE_NVML_APPLICATION_CLOCKS */ +} + +gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str, + const struct gmx_gpu_info_t *gpu_info, + const struct gmx_gpu_opt_t *gpu_opt) +{ + cudaError_t stat; + char sbuf[STRLEN]; + int gpuid; + + assert(gpu_info); + assert(result_str); + + if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use) + { + sprintf(sbuf, "Trying to initialize an inexistent GPU: " + "there are %d %s-selected GPU(s), but #%d was requested.", + gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu); + gmx_incons(sbuf); + } + + gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id; + + stat = cudaSetDevice(gpuid); + strncpy(result_str, cudaGetErrorString(stat), STRLEN); + + if (debug) + { + fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name); + } + + //Ignoring return value as NVML errors should be treated not critical. + if (stat == cudaSuccess) + { + init_gpu_application_clocks(fplog, gpuid, gpu_info); + } + return (stat == cudaSuccess); +} + +gmx_bool free_cuda_gpu( + int gmx_unused mygpu, char *result_str, + const gmx_gpu_info_t gmx_unused *gpu_info, + const gmx_gpu_opt_t gmx_unused *gpu_opt + ) +{ + cudaError_t stat; + gmx_bool reset_gpu_application_clocks_status = true; + int gpuid; + + assert(result_str); + + if (debug) + { + int gpuid; + stat = cudaGetDevice(&gpuid); + CU_RET_ERR(stat, "cudaGetDevice failed"); + fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid); + } + + gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1; + if (gpuid != -1) + { + reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) ); + } + + stat = cudaDeviceReset(); + strncpy(result_str, cudaGetErrorString(stat), STRLEN); + return (stat == cudaSuccess) && reset_gpu_application_clocks_status; +} + +/*! \brief Returns true if the gpu characterized by the device properties is + * supported by the native gpu acceleration. + * + * \param[in] dev_prop the CUDA device properties of the gpus to test. + * \returns true if the GPU properties passed indicate a compatible + * GPU, otherwise false. + */ +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop) +{ + return (dev_prop->major >= 2); +} + +/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU. + * + * \param[in] stat GPU status. + * \returns true if the provided status is egpuCompatible, otherwise false. + */ +static bool is_compatible_gpu(int stat) +{ + return (stat == egpuCompatible); +} + +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. + * + * Returns a status value which indicates compatibility or one of the following + * errors: incompatibility, insistence, or insanity (=unexpected behavior). 
+ * It also returns the respective device's properties in \dev_prop (if applicable). + * + * \param[in] dev_id the ID of the GPU to check. + * \param[out] dev_prop the CUDA device properties of the device checked. + * \returns the status of the requested device + */ +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) +{ + cudaError_t stat; + int ndev; + + stat = cudaGetDeviceCount(&ndev); + if (stat != cudaSuccess) + { + return egpuInsane; + } + + if (dev_id > ndev - 1) + { + return egpuNonexistent; + } + + /* TODO: currently we do not make a distinction between the type of errors + * that can appear during sanity checks. This needs to be improved, e.g if + * the dummy test kernel fails to execute with a "device busy message" we + * should appropriately report that the device is busy instead of insane. + */ + if (do_sanity_checks(dev_id, dev_prop) == 0) + { + if (is_gmx_supported_gpu(dev_prop)) + { + return egpuCompatible; + } + else + { + return egpuIncompatible; + } + } + else + { + return egpuInsane; + } +} + + +int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str) +{ + int i, ndev, checkres, retval; + cudaError_t stat; + cudaDeviceProp prop; + gmx_device_info_t *devs; + + assert(gpu_info); + assert(err_str); + + gpu_info->n_dev_compatible = 0; + + ndev = 0; + devs = NULL; + + stat = cudaGetDeviceCount(&ndev); + if (stat != cudaSuccess) + { + const char *s; + + /* cudaGetDeviceCount failed which means that there is something + * wrong with the machine: driver-runtime mismatch, all GPUs being + * busy in exclusive mode, or some other condition which should + * result in us issuing a warning a falling back to CPUs. */ + retval = -1; + s = cudaGetErrorString(stat); + strncpy(err_str, s, STRLEN*sizeof(err_str[0])); + } + else + { + snew(devs, ndev); + for (i = 0; i < ndev; i++) + { + checkres = is_gmx_supported_gpu_id(i, &prop); + + devs[i].id = i; + devs[i].prop = prop; + devs[i].stat = checkres; + + if (checkres == egpuCompatible) + { + gpu_info->n_dev_compatible++; + } + } + retval = 0; + } + + gpu_info->n_dev = ndev; + gpu_info->gpu_dev = devs; + + return retval; +} + +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) +{ + int i, ncompat; + int *compat; + + assert(gpu_info); + /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */ + assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1)); + + snew(compat, gpu_info->n_dev); + ncompat = 0; + for (i = 0; i < gpu_info->n_dev; i++) + { + if (is_compatible_gpu(gpu_info->gpu_dev[i].stat)) + { + ncompat++; + compat[ncompat - 1] = i; + } + } + + gpu_opt->n_dev_compatible = ncompat; + snew(gpu_opt->dev_compatible, ncompat); + memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat)); + sfree(compat); +} + +gmx_bool check_selected_gpus(int *checkres, + const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) +{ + int i, id; + bool bAllOk; + + assert(checkres); + assert(gpu_info); + assert(gpu_opt->n_dev_use >= 0); + + if (gpu_opt->n_dev_use == 0) + { + return TRUE; + } + + assert(gpu_opt->dev_use); + + /* we will assume that all GPUs requested are valid IDs, + otherwise we'll bail anyways */ + + bAllOk = true; + for (i = 0; i < gpu_opt->n_dev_use; i++) + { + id = gpu_opt->dev_use[i]; + + /* devices are stored in increasing order of IDs in gpu_dev */ + gpu_opt->dev_use[i] = id; + + checkres[i] = (id >= gpu_info->n_dev) ? 
+ egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+ bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+ }
+
+ return bAllOk;
+}
+
+void free_gpu_info(const gmx_gpu_info_t *gpu_info)
+{
+ if (gpu_info == NULL)
+ {
+ return;
+ }
+
+ sfree(gpu_info->gpu_dev);
+}
+
+void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
+{
+ assert(s);
+ assert(gpu_info);
+
+ if (index < 0 || index >= gpu_info->n_dev)
+ {
+ return;
+ }
+
+ gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
+
+ bool bGpuExists =
+ dinfo->stat == egpuCompatible ||
+ dinfo->stat == egpuIncompatible;
+
+ if (!bGpuExists)
+ {
+ sprintf(s, "#%d: %s, stat: %s",
+ dinfo->id, "N/A",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+ else
+ {
+ sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
+ dinfo->id, dinfo->prop.name,
+ dinfo->prop.major, dinfo->prop.minor,
+ dinfo->prop.ECCEnabled ? "yes" : " no",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+}
+
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
+{
+ assert(gpu_info);
+ assert(gpu_opt);
+ assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+ return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
+}
+
+int get_current_cuda_gpu_device_id(void)
+{
+ int gpuid;
+ CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
+
+ return gpuid;
+}
+
+size_t sizeof_gpu_dev_info(void)
+{
+ return sizeof(gmx_device_info_t);
+}
+
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free)
+{
+ if (bUseGpuKernels)
+ {
+ *nb_alloc = &pmalloc;
+ *nb_free = &pfree;
+ }
+ else
+ {
+ *nb_alloc = NULL;
+ *nb_free = NULL;
+ }
+}
+
+void startGpuProfiler(void)
+{
+ /* The NVPROF_ID environment variable is set by nvprof and indicates that
+ mdrun is executed in the CUDA profiler.
+ If nvprof was run with "--profile-from-start off", the profiler will
+ be started here. This way we can avoid tracing the CUDA events from the
+ first part of the run. Starting the profiler again does nothing.
+ */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStart();
+ CU_RET_ERR(stat, "cudaProfilerStart failed");
+ }
+}
+
+void stopGpuProfiler(void)
+{
+ /* Stopping the nvidia profiler here allows us to eliminate the subsequent
+ API calls from the trace, e.g. uninitialization and cleanup. */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStop();
+ CU_RET_ERR(stat, "cudaProfilerStop failed");
+ }
+}
+
+void resetGpuProfiler(void)
+{
+ /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
+ * the profiling here (can't stop it) which will achieve the desired effect if
+ * the run was started with the profiling disabled.
+ *
+ * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
+ * stopGpuProfiler();
+ */
+ if (cudaProfilerRun)
+ {
+ startGpuProfiler();
+ }
+}
diff --cc src/gromacs/hardware/hardwaretopology.cpp
index 8a284ae0e2,0000000000..59a4d1d9ff
mode 100644,000000..100644
--- a/src/gromacs/hardware/hardwaretopology.cpp
+++ b/src/gromacs/hardware/hardwaretopology.cpp
@@@ -1,677 -1,0 +1,712 @@@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Implements gmx::HardwareTopology. + * + * \author Erik Lindahl + * \ingroup module_hardware + */ + +#include "gmxpre.h" + +#include "hardwaretopology.h" + +#include "config.h" + +#include + +#include +#include + +#if GMX_HWLOC +# include +#endif + +#include "gromacs/gmxlib/md_logging.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxomp.h" + +#ifdef HAVE_UNISTD_H +# include // sysconf() +#endif +#if GMX_NATIVE_WINDOWS +# include // GetSystemInfo() +#endif + ++#if defined(_M_ARM) || defined(__arm__) || defined(__ARM_ARCH) || defined (__aarch64__) ++//! Constant used to help minimize preprocessed code ++static const bool isArm = true; ++#else ++//! Constant used to help minimize preprocessed code ++static const bool isArm = false; ++#endif ++ +namespace gmx +{ + +namespace +{ + +/***************************************************************************** + * * + * Utility functions for extracting hardware topology from CpuInfo object * + * * + *****************************************************************************/ + +/*! \brief Initialize machine data from basic information in cpuinfo + * + * \param machine Machine tree structure where information will be assigned + * if the cpuinfo object contains topology information. + * \param supportLevel If topology information is available in CpuInfo, + * this will be updated to reflect the amount of + * information written to the machine structure. 
+ */ +void +parseCpuInfo(HardwareTopology::Machine * machine, + HardwareTopology::SupportLevel * supportLevel) +{ + CpuInfo cpuInfo(CpuInfo::detect()); + + if (!cpuInfo.logicalProcessors().empty()) + { + int nSockets = 0; + int nCores = 0; + int nHwThreads = 0; + + // Copy the logical processor information from cpuinfo + for (auto &l : cpuInfo.logicalProcessors()) + { + machine->logicalProcessors.push_back( { l.socketRankInMachine, l.coreRankInSocket, l.hwThreadRankInCore, -1 } ); + nSockets = std::max(nSockets, l.socketRankInMachine); + nCores = std::max(nCores, l.coreRankInSocket); + nHwThreads = std::max(nHwThreads, l.hwThreadRankInCore); + } + + // Fill info form sockets/cores/hwthreads + int socketId = 0; + int coreId = 0; + int hwThreadId = 0; + + machine->sockets.resize(nSockets + 1); + for (auto &s : machine->sockets) + { + s.id = socketId++; + s.cores.resize(nCores + 1); + for (auto &c : s.cores) + { + c.id = coreId++; + c.numaNodeId = -1; // No numa information + c.hwThreads.resize(nHwThreads + 1); + for (auto &t : c.hwThreads) + { + t.id = hwThreadId++; + t.logicalProcessorId = -1; // set as unassigned for now + } + } + } + + // Fill the logical processor id in the right place + for (std::size_t i = 0; i < machine->logicalProcessors.size(); i++) + { + const HardwareTopology::LogicalProcessor &l = machine->logicalProcessors[i]; + machine->sockets[l.socketRankInMachine].cores[l.coreRankInSocket].hwThreads[l.hwThreadRankInCore].logicalProcessorId = static_cast(i); + } + machine->logicalProcessorCount = machine->logicalProcessors.size(); + *supportLevel = HardwareTopology::SupportLevel::Basic; + } + else + { + *supportLevel = HardwareTopology::SupportLevel::None; + } +} + +#if GMX_HWLOC + +#if HWLOC_API_VERSION < 0x00010b00 +# define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET +# define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE +#endif + +/***************************************************************************** + * * + * Utility functions for extracting hardware topology from hwloc library * + * * + *****************************************************************************/ + +/*! \brief Return vector of all descendants of a given type in hwloc topology + * + * \param obj Non-null hwloc object. + * \param type hwloc object type to find. The routine will only search + * on levels below obj. + * + * \return vector containing all the objects of given type that are + * descendants of the provided object. If no objects of this type + * were found, the vector will be empty. + */ +const std::vector +getHwLocDescendantsByType(const hwloc_obj_t obj, const hwloc_obj_type_t type) +{ + GMX_RELEASE_ASSERT(obj, "NULL hwloc object provided to getHwLocDescendantsByType()"); + + std::vector v; + + // Go through children; if this object has no children obj->arity is 0, + // and we'll return an empty vector. + for (std::size_t i = 0; i < obj->arity; i++) + { + // If the child is the type we're looking for, add it directly. + // Otherwise call this routine recursively for each child. + if (obj->children[i]->type == type) + { + v.push_back(obj->children[i]); + } + else + { + std::vector v2 = getHwLocDescendantsByType(obj->children[i], type); + v.insert(v.end(), v2.begin(), v2.end()); + } + } + return v; +} + +/*! \brief Read information about sockets, cores and threads from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where the tree of sockets/cores/threads will be written. 
+ * + * \return If all the data is found the return value is 0, otherwise non-zero. + */ +int +parseHwLocSocketsCoresThreads(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector hwlocSockets = getHwLocDescendantsByType(root, HWLOC_OBJ_PACKAGE); + + machine->logicalProcessorCount = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); + machine->logicalProcessors.resize(machine->logicalProcessorCount); + machine->sockets.resize(hwlocSockets.size()); + + bool topologyOk = !hwlocSockets.empty(); // Fail if we have no sockets in machine + + for (std::size_t i = 0; i < hwlocSockets.size() && topologyOk; i++) + { + // Assign information about this socket + machine->sockets[i].id = hwlocSockets[i]->logical_index; + + // Get children (cores) + std::vector hwlocCores = getHwLocDescendantsByType(hwlocSockets[i], HWLOC_OBJ_CORE); + machine->sockets[i].cores.resize(hwlocCores.size()); + + topologyOk = topologyOk && !hwlocCores.empty(); // Fail if we have no cores in socket + + // Loop over child cores + for (std::size_t j = 0; j < hwlocCores.size() && topologyOk; j++) + { + // Assign information about this core + machine->sockets[i].cores[j].id = hwlocCores[j]->logical_index; + machine->sockets[i].cores[j].numaNodeId = -1; + + // Get children (hwthreads) + std::vector hwlocPUs = getHwLocDescendantsByType(hwlocCores[j], HWLOC_OBJ_PU); + machine->sockets[i].cores[j].hwThreads.resize(hwlocPUs.size()); + + topologyOk = topologyOk && !hwlocPUs.empty(); // Fail if we have no hwthreads in core + + // Loop over child hwthreads + for (std::size_t k = 0; k < hwlocPUs.size() && topologyOk; k++) + { + // Assign information about this hwthread + std::size_t logicalProcessorId = hwlocPUs[k]->os_index; + machine->sockets[i].cores[j].hwThreads[k].id = hwlocPUs[k]->logical_index; + machine->sockets[i].cores[j].hwThreads[k].logicalProcessorId = logicalProcessorId; + + if (logicalProcessorId < machine->logicalProcessors.size()) + { + // Cross-assign data for this hwthread to the logicalprocess vector + machine->logicalProcessors[logicalProcessorId].socketRankInMachine = static_cast(i); + machine->logicalProcessors[logicalProcessorId].coreRankInSocket = static_cast(j); + machine->logicalProcessors[logicalProcessorId].hwThreadRankInCore = static_cast(k); + machine->logicalProcessors[logicalProcessorId].numaNodeId = -1; + } + else + { + topologyOk = false; + } + } + } + } + + if (topologyOk) + { + return 0; + } + else + { + machine->logicalProcessors.clear(); + machine->sockets.clear(); + return -1; + } +} + +/*! \brief Read cache information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where cache data will be filled. + * + * \return If any cache data is found the return value is 0, otherwise non-zero. 
+ */ +int +parseHwLocCache(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + // Parse caches up to L5 + for (int cachelevel : { 1, 2, 3, 4, 5}) + { + int depth = hwloc_get_cache_type_depth(topo, cachelevel, HWLOC_OBJ_CACHE_DATA); + + if (depth >= 0) + { + hwloc_obj_t cache = hwloc_get_next_obj_by_depth(topo, depth, NULL); + if (cache != NULL) + { + std::vector hwThreads = getHwLocDescendantsByType(cache, HWLOC_OBJ_PU); + + machine->caches.push_back( { + static_cast(cache->attr->cache.depth), + static_cast(cache->attr->cache.size), + static_cast(cache->attr->cache.linesize), + static_cast(cache->attr->cache.associativity), + std::max(static_cast(hwThreads.size()), 1) + } ); + } + } + } + return machine->caches.empty(); +} + + +/*! \brief Read numa information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where numa information will be filled. + * + * Hwloc should virtually always be able to detect numa information, but if + * there is only a single numa node in the system it is not reported at all. + * In this case we create a single numa node covering all cores. + * + * This function uses the basic socket/core/thread information detected by + * parseHwLocSocketsCoresThreads(), which means that routine must have + * completed successfully before calling this one. If this is not the case, + * you will get an error return code. + * + * \return If the data found makes sense (either in the numa node or the + * entire machine) the return value is 0, otherwise non-zero. + */ +int +parseHwLocNuma(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector hwlocNumaNodes = getHwLocDescendantsByType(root, HWLOC_OBJ_NUMANODE); + bool topologyOk = true; + + if (!hwlocNumaNodes.empty()) + { + machine->numa.nodes.resize(hwlocNumaNodes.size()); + + for (std::size_t i = 0; i < hwlocNumaNodes.size(); i++) + { + machine->numa.nodes[i].id = hwlocNumaNodes[i]->logical_index; + machine->numa.nodes[i].memory = hwlocNumaNodes[i]->memory.total_memory; + machine->numa.nodes[i].logicalProcessorId.clear(); + + // Get list of PUs in this numa node + std::vector hwlocPUs = getHwLocDescendantsByType(hwlocNumaNodes[i], HWLOC_OBJ_PU); + + for (auto &p : hwlocPUs) + { + machine->numa.nodes[i].logicalProcessorId.push_back(p->os_index); + + GMX_RELEASE_ASSERT(p->os_index < machine->logicalProcessors.size(), "OS index of PU in hwloc larger than processor count"); + + machine->logicalProcessors[p->os_index].numaNodeId = static_cast(i); + std::size_t s = machine->logicalProcessors[p->os_index].socketRankInMachine; + std::size_t c = machine->logicalProcessors[p->os_index].coreRankInSocket; + + GMX_RELEASE_ASSERT(s < machine->sockets.size(), "Socket index in logicalProcessors larger than socket count"); + GMX_RELEASE_ASSERT(c < machine->sockets[s].cores.size(), "Core index in logicalProcessors larger than core count"); + // Set numaNodeId in core too + machine->sockets[s].cores[c].numaNodeId = i; + } + } + + int depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE); + const struct hwloc_distances_s * dist = hwloc_get_whole_distance_matrix_by_depth(topo, depth); + if (dist != NULL && dist->nbobjs == hwlocNumaNodes.size()) + { + machine->numa.baseLatency = dist->latency_base; + machine->numa.maxRelativeLatency = dist->latency_max; + machine->numa.relativeLatency.resize(dist->nbobjs); + for 
(std::size_t i = 0; i < dist->nbobjs; i++) + { + machine->numa.relativeLatency[i].resize(dist->nbobjs); + for (std::size_t j = 0; j < dist->nbobjs; j++) + { + machine->numa.relativeLatency[i][j] = dist->latency[i*dist->nbobjs+j]; + } + } + } + else + { + topologyOk = false; + } + } + else + { + // No numa nodes found. Use the entire machine as a numa node. + const hwloc_obj_t hwlocMachine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, NULL); + + if (hwlocMachine != NULL) + { + machine->numa.nodes.resize(1); + machine->numa.nodes[0].id = 0; + machine->numa.nodes[0].memory = hwlocMachine->memory.total_memory; + machine->numa.baseLatency = 10; + machine->numa.maxRelativeLatency = 1; + machine->numa.relativeLatency = { { 1.0 } }; + + for (int i = 0; i < machine->logicalProcessorCount; i++) + { + machine->numa.nodes[0].logicalProcessorId.push_back(i); + } + for (auto &l : machine->logicalProcessors) + { + l.numaNodeId = 0; + } + for (auto &s : machine->sockets) + { + for (auto &c : s.cores) + { + c.numaNodeId = 0; + } + } + } + else + { + topologyOk = false; + } + } + + if (topologyOk) + { + return 0; + } + else + { + machine->numa.nodes.clear(); + return -1; + } + +} + +/*! \brief Read PCI device information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where PCI device information will be filled. + * * + * \return If any devices were found the return value is 0, otherwise non-zero. + */ +int +parseHwLocDevices(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector pcidevs = getHwLocDescendantsByType(root, HWLOC_OBJ_PCI_DEVICE); + + for (auto &p : pcidevs) + { + const hwloc_obj_t ancestor = hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_NUMANODE, p); + int numaId; + if (ancestor != NULL) + { + numaId = ancestor->logical_index; + } + else + { + // If we only have a single numa node we belong to it, otherwise set it to -1 (unknown) + numaId = (machine->numa.nodes.size() == 1) ? 0 : -1; + } + + GMX_RELEASE_ASSERT(p->attr, "Attributes should not be NULL for hwloc PCI object"); + + machine->devices.push_back( { + p->attr->pcidev.vendor_id, + p->attr->pcidev.device_id, + p->attr->pcidev.class_id, + p->attr->pcidev.domain, + p->attr->pcidev.bus, + p->attr->pcidev.dev, + p->attr->pcidev.func, + numaId + } ); + } + return pcidevs.empty(); +} + +void +parseHwLoc(HardwareTopology::Machine * machine, + HardwareTopology::SupportLevel * supportLevel, + bool * isThisSystem) +{ + hwloc_topology_t topo; + + // Initialize a hwloc object, set flags to request IO device information too, + // try to load the topology, and get the root object. If either step fails, + // return that we do not have any support at all from hwloc. + if (hwloc_topology_init(&topo) != 0) + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. + } + + hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES); + + if (hwloc_topology_load(topo) != 0 || hwloc_get_root_obj(topo) == NULL) + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. 
+ } + + // If we get here, we can get a valid root object for the topology + *isThisSystem = hwloc_topology_is_thissystem(topo); + + // Parse basic information about sockets, cores, and hardware threads + if (parseHwLocSocketsCoresThreads(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::Basic; + } + else + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. + } + + // Get information about cache and numa nodes + if (parseHwLocCache(topo, machine) == 0 && parseHwLocNuma(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::Full; + } + else + { + hwloc_topology_destroy(topo); + return; // SupportLevel::Basic. + } + + // PCI devices + if (parseHwLocDevices(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::FullWithDevices; + } + + hwloc_topology_destroy(topo); + return; // SupportLevel::Full or SupportLevel::FullWithDevices. +} + +#endif + +/*! \brief Try to detect the number of logical processors. + * + * \return The number of hardware processing units, or 0 if it fails. + */ +int +detectLogicalProcessorCount(FILE *fplog, const t_commrec *cr) +{ + int count = 0; + + { +#if GMX_NATIVE_WINDOWS + // Windows + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + count = sysinfo.dwNumberOfProcessors; +#elif defined HAVE_SYSCONF + // We are probably on Unix. Check if we have the argument to use before executing any calls +# if defined(_SC_NPROCESSORS_CONF) + count = sysconf(_SC_NPROCESSORS_CONF); +# if defined(_SC_NPROCESSORS_ONLN) + /* On e.g. Arm, the Linux kernel can use advanced power saving features where + * processors are brought online/offline dynamically. This will cause + * _SC_NPROCESSORS_ONLN to report 1 at the beginning of the run. For this + * reason we now warn if this mismatches with the detected core count. */ + int countOnline = sysconf(_SC_NPROCESSORS_ONLN); + if (count != countOnline) + { - md_print_warn(cr, fplog, - "%d CPUs configured, but only %d of them are online.\n" - "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n" - "off to save power, and will turn them back on later when the load increases.\n" - "However, this will likely mean GROMACS cannot pin threads to those cores. You\n" - "will likely see much better performance by forcing all cores to be online, and\n" - "making sure they run at their full clock frequency.", count, countOnline); ++ /* We assume that this scenario means that the kernel has ++ disabled threads or cores, and that the only safe course is ++ to assume that _SC_NPROCESSORS_ONLN should be used. Even ++ this may not be valid if running in a containerized ++ environment, such system calls may read from ++ /sys/devices/system/cpu and report what the OS sees, rather ++ than what the container cgroup is supposed to set up as ++ limits. But we're not sure right now whether there's any ++ (standard-ish) way to handle that. ++ ++ On ARM, the kernel may have powered down the cores, ++ which we'll warn the user about now. On x86, this ++ means HT is disabled by the kernel, not in the ++ BIOS. We're not sure what it means on other ++ architectures, or even if it is possible, because ++ sysconf is rather non-standardized. */ ++ if (isArm) ++ { ++ md_print_warn(cr, fplog, ++ "%d CPUs configured, but only %d of them are online.\n" ++ "This can happen on embedded platforms (e.g. 
ARM) where the OS shuts some cores\n" ++ "off to save power, and will turn them back on later when the load increases.\n" ++ "However, this will likely mean GROMACS cannot pin threads to those cores. You\n" ++ "will likely see much better performance by forcing all cores to be online, and\n" ++ "making sure they run at their full clock frequency.", count, countOnline); ++ } ++ else ++ { ++ md_print_warn(cr, fplog, ++ "Note: %d CPUs configured, but only %d of them are online, so GROMACS will use the latter.", ++ count, countOnline); ++ // We use the online count to avoid (potential) oversubscription. ++ count = countOnline; ++ } + } +# endif +# elif defined(_SC_NPROC_CONF) + count = sysconf(_SC_NPROC_CONF); +# elif defined(_SC_NPROCESSORS_ONLN) + count = sysconf(_SC_NPROCESSORS_ONLN); +# elif defined(_SC_NPROC_ONLN) + count = sysconf(_SC_NPROC_ONLN); +# else +# warning "No valid sysconf argument value found. Executables will not be able to determine the number of logical cores: mdrun will use 1 thread by default!" +# endif // End of check for sysconf argument values + +#else + count = 0; // Neither windows nor Unix. +#endif + } +#if GMX_OPENMP + int countFromOpenmp = gmx_omp_get_num_procs(); + if (count != countFromOpenmp) + { + md_print_warn(cr, fplog, + "Number of logical cores detected (%d) does not match the number reported by OpenMP (%d).\n" + "Consider setting the launch configuration manually!", + count, countFromOpenmp); + } +#endif + + GMX_UNUSED_VALUE(cr); + GMX_UNUSED_VALUE(fplog); + return count; +} + +} // namespace anonymous + +// static +HardwareTopology HardwareTopology::detect(FILE *fplog, const t_commrec *cr) +{ + HardwareTopology result; + + // Default values for machine and numa stuff + result.machine_.logicalProcessorCount = 0; + result.machine_.numa.baseLatency = 0.0; + result.machine_.numa.maxRelativeLatency = 0.0; + result.supportLevel_ = SupportLevel::None; + result.isThisSystem_ = true; + +#if GMX_HWLOC + parseHwLoc(&result.machine_, &result.supportLevel_, &result.isThisSystem_); +#endif + + // If something went wrong in hwloc (or if it was not present) we might + // have more information in cpuInfo + if (result.supportLevel_ < SupportLevel::Basic) + { + // There might be topology information in cpuInfo + parseCpuInfo(&result.machine_, &result.supportLevel_); + } + // If we did not manage to get anything from either hwloc or cpuInfo, find the cpu count at least + if (result.supportLevel_ == SupportLevel::None) + { + // No topology information; try to detect the number of logical processors at least + result.machine_.logicalProcessorCount = detectLogicalProcessorCount(fplog, cr); + if (result.machine_.logicalProcessorCount > 0) + { + result.supportLevel_ = SupportLevel::LogicalProcessorCount; + } + } + return result; +} + + +HardwareTopology::HardwareTopology() + : supportLevel_(SupportLevel::None) +{ +} + +int HardwareTopology::numberOfCores() const +{ + if (supportLevel() >= SupportLevel::Basic) + { + // We assume all sockets have the same number of cores as socket 0. + // Since topology information is present, we can assume there is at least one socket. + return machine().sockets.size() * machine().sockets[0].cores.size(); + } + else if (supportLevel() >= SupportLevel::LogicalProcessorCount) + { + return machine().logicalProcessorCount; + } + else + { + return 0; + } +} + +} // namespace gmx
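
For context, here is a minimal usage sketch of the error-checking macros declared in cudautils.cuh above. The kernel k_scale, the helper scaleOnGpu, and the sizes are invented for illustration only and are not part of this change; only CU_RET_ERR and CU_LAUNCH_ERR come from the header.

    /* Hypothetical usage sketch of the cudautils.cuh error-checking macros. */
    #include "gromacs/gpu_utils/cudautils.cuh"

    __global__ void k_scale(float *d_x, int n, float s)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n)
        {
            d_x[i] *= s;
        }
    }

    static void scaleOnGpu(float *h_x, int n, float s)
    {
        float      *d_x;
        cudaError_t stat;

        /* Non performance-critical path: always check the return status. */
        stat = cudaMalloc((void **)&d_x, n*sizeof(*d_x));
        CU_RET_ERR(stat, "cudaMalloc of d_x failed");
        stat = cudaMemcpy(d_x, h_x, n*sizeof(*d_x), cudaMemcpyHostToDevice);
        CU_RET_ERR(stat, "H2D copy of x failed");

        /* Kernel launches return no status; check the last error instead. */
        k_scale<<< (n + 255)/256, 256 >>> (d_x, n, s);
        CU_LAUNCH_ERR("k_scale");

        stat = cudaMemcpy(h_x, d_x, n*sizeof(*d_x), cudaMemcpyDeviceToHost);
        CU_RET_ERR(stat, "D2H copy of x failed");
        stat = cudaFree(d_x);
        CU_RET_ERR(stat, "cudaFree of d_x failed");
    }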
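
Similarly, a rough standalone sketch of the configured-versus-online processor check that this merge folds into detectLogicalProcessorCount(). The helper name countLogicalProcessors and the plain fprintf reporting are assumptions for this example, not GROMACS symbols; the fallback to the online count mirrors the non-ARM branch of the patch.

    /* Standalone sketch of sysconf()-based logical processor detection. */
    #include <cstdio>
    #include <unistd.h>

    static int countLogicalProcessors()
    {
        int count = 0;
    #if defined(_SC_NPROCESSORS_CONF)
        count = sysconf(_SC_NPROCESSORS_CONF);
    #  if defined(_SC_NPROCESSORS_ONLN)
        int countOnline = sysconf(_SC_NPROCESSORS_ONLN);
        if (count != countOnline)
        {
            /* Some cores are configured but offline (e.g. powered down on ARM,
               or hyperthreads disabled by the kernel on x86). Using the online
               count avoids oversubscribing the processors actually available. */
            std::fprintf(stderr,
                         "Note: %d CPUs configured, but only %d are online; using the latter.\n",
                         count, countOnline);
            count = countOnline;
        }
    #  endif
    #endif
        return count;
    }

    int main()
    {
        std::printf("Detected %d logical processors\n", countLogicalProcessors());
        return 0;
    }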