From e519efa8063c6edae55123e6abf01218c6116e68 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 24 Aug 2020 08:39:46 +0000 Subject: [PATCH] Move GPU devices management into hardware subfolder This consolidate the GPU management in CUDA and OpenCL into one place. --- src/gromacs/ewald/pme_gpu_internal.cpp | 1 + .../ewald/tests/testhardwarecontexts.cpp | 2 +- src/gromacs/gpu_utils/CMakeLists.txt | 1 - src/gromacs/gpu_utils/device_context_ocl.cpp | 2 +- src/gromacs/gpu_utils/device_stream_ocl.cpp | 1 + src/gromacs/gpu_utils/gpu_testutils.cpp | 4 +- src/gromacs/gpu_utils/gpu_utils.cpp | 52 +- src/gromacs/gpu_utils/gpu_utils.cu | 421 ---------------- src/gromacs/gpu_utils/gpu_utils.h | 154 ------ src/gromacs/gpu_utils/gputraits.cuh | 16 - src/gromacs/gpu_utils/gputraits.h | 6 - src/gromacs/gpu_utils/gputraits_ocl.h | 32 -- src/gromacs/gpu_utils/ocl_compiler.h | 1 + .../gpu_utils/tests/devicetransfers.cu | 4 +- .../gpu_utils/tests/devicetransfers_ocl.cpp | 2 +- src/gromacs/gpu_utils/tests/gputest.cpp | 2 +- .../gpu_utils/tests/typecasts_runner.cu | 1 + src/gromacs/hardware/CMakeLists.txt | 15 + src/gromacs/hardware/detecthardware.cpp | 2 +- src/gromacs/hardware/device_information.h | 148 ++++++ src/gromacs/hardware/device_management.cpp | 90 ++++ src/gromacs/hardware/device_management.cu | 466 ++++++++++++++++++ src/gromacs/hardware/device_management.h | 188 +++++++ .../hardware/device_management_common.cpp | 89 ++++ .../device_management_ocl.cpp} | 56 ++- src/gromacs/hardware/gpu_hw_info.h | 41 -- src/gromacs/hardware/printhardware.cpp | 2 +- src/gromacs/mdlib/tests/constrtestrunners.cu | 2 +- .../mdlib/tests/leapfrogtestrunners.cu | 2 +- src/gromacs/mdlib/tests/settletestrunners.cu | 2 +- src/gromacs/mdrun/runner.cpp | 2 +- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 1 + .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 2 +- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 1 + .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 2 +- src/gromacs/taskassignment/taskassignment.cpp | 2 +- src/gromacs/taskassignment/usergpuids.cpp | 4 +- 37 files changed, 1051 insertions(+), 768 deletions(-) create mode 100644 src/gromacs/hardware/device_information.h create mode 100644 src/gromacs/hardware/device_management.cpp create mode 100644 src/gromacs/hardware/device_management.cu create mode 100644 src/gromacs/hardware/device_management.h create mode 100644 src/gromacs/hardware/device_management_common.cpp rename src/gromacs/{gpu_utils/gpu_utils_ocl.cpp => hardware/device_management_ocl.cpp} (99%) diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 24e9a4ba4b..88e39976c7 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -59,6 +59,7 @@ #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/math/units.h" #include "gromacs/timing/gpu_timing.h" diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp index 96f36f9810..6ae36951d5 100644 --- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp +++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp @@ -47,8 +47,8 @@ #include #include "gromacs/ewald/pme.h" -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/utility/basenetwork.h" #include "gromacs/utility/exceptions.h" diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index 4db569f5fa..a85efdeda1 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -47,7 +47,6 @@ if(GMX_GPU_OPENCL) gmx_add_libgromacs_sources( device_context_ocl.cpp device_stream_ocl.cpp - gpu_utils_ocl.cpp ocl_compiler.cpp ocl_caching.cpp oclutils.cpp diff --git a/src/gromacs/gpu_utils/device_context_ocl.cpp b/src/gromacs/gpu_utils/device_context_ocl.cpp index cfbd60c1a3..e8deb7fab2 100644 --- a/src/gromacs/gpu_utils/device_context_ocl.cpp +++ b/src/gromacs/gpu_utils/device_context_ocl.cpp @@ -45,7 +45,7 @@ #include "device_context_ocl.h" -#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" diff --git a/src/gromacs/gpu_utils/device_stream_ocl.cpp b/src/gromacs/gpu_utils/device_stream_ocl.cpp index 39c58ff5da..84407b1674 100644 --- a/src/gromacs/gpu_utils/device_stream_ocl.cpp +++ b/src/gromacs/gpu_utils/device_stream_ocl.cpp @@ -45,6 +45,7 @@ #include "gromacs/gpu_utils/device_context_ocl.h" #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gputraits_ocl.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" diff --git a/src/gromacs/gpu_utils/gpu_testutils.cpp b/src/gromacs/gpu_utils/gpu_testutils.cpp index 63a8756afc..99b173c4a6 100644 --- a/src/gromacs/gpu_utils/gpu_testutils.cpp +++ b/src/gromacs/gpu_utils/gpu_testutils.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -41,7 +41,7 @@ #include "gpu_testutils.h" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/gpu_hw_info.h" bool canComputeOnGpu() diff --git a/src/gromacs/gpu_utils/gpu_utils.cpp b/src/gromacs/gpu_utils/gpu_utils.cpp index 004ad30c86..a8eb03b23a 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cpp +++ b/src/gromacs/gpu_utils/gpu_utils.cpp @@ -41,11 +41,9 @@ #include "gpu_utils.h" -#include "config.h" - #include -#include "gromacs/hardware/gpu_hw_info.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" @@ -54,54 +52,6 @@ # pragma warning(disable : 6237) #endif -//! Constant used to help minimize preprocessed code -static constexpr bool c_binarySupportsGpus = (GMX_GPU != 0); - -bool canPerformGpuDetection() -{ - if (c_binarySupportsGpus && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr) - { - return isGpuDetectionFunctional(nullptr); - } - else - { - return false; - } -} - -#if !GMX_GPU -DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& /*unused*/, int /*unused*/) -{ - return DeviceStatus::Nonexistent; -} -#endif - -void free_gpu_info(const gmx_gpu_info_t* gpu_info) -{ - sfree(static_cast(gpu_info->deviceInfo)); // circumvent is_pod check in sfree -} - -std::vector getCompatibleGpus(const gmx_gpu_info_t& gpu_info) -{ - // Possible minor over-allocation here, but not important for anything - std::vector compatibleGpus; - compatibleGpus.reserve(gpu_info.n_dev); - for (int i = 0; i < gpu_info.n_dev; i++) - { - assert(gpu_info.deviceInfo); - if (gpu_info_get_stat(gpu_info, i) == DeviceStatus::Compatible) - { - compatibleGpus.push_back(i); - } - } - return compatibleGpus; -} - -const char* getGpuCompatibilityDescription(const gmx_gpu_info_t& gpu_info, int index) -{ - return (index >= gpu_info.n_dev ? c_deviceStateString[DeviceStatus::Nonexistent] - : c_deviceStateString[gpu_info_get_stat(gpu_info, index)]); -} /*! \brief Help build a descriptive message in \c error if there are * \c errorReasons why nonbondeds on a GPU are not supported. * diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 7d282e5a1a..e0ae3bed30 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -65,39 +65,8 @@ #include "gromacs/utility/snprintf.h" #include "gromacs/utility/stringutil.h" -/*! \internal \brief - * Max number of devices supported by CUDA (for consistency checking). - * - * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. - */ -static int cuda_max_device_count = 32; - static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr)); -/** Dummy kernel used for sanity checking. */ -static __global__ void k_dummy_test(void) {} - -static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp) -{ - cudaFuncAttributes attributes; - cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test); - - if (cudaErrorInvalidDeviceFunction == stat) - { - fprintf(stderr, - "\nWARNING: The %s binary does not include support for the CUDA architecture of " - "the GPU ID #%d (compute capability %d.%d) detected during detection. " - "By default, GROMACS supports all architectures of compute " - "capability >= 3.0, so your GPU " - "might be rare, or some architectures were disabled in the build. \n" - "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and " - "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n", - gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor); - } - - return stat; -} - bool isHostMemoryPinned(const void* h_ptr) { cudaPointerAttributes memoryAttributes; @@ -133,391 +102,6 @@ bool isHostMemoryPinned(const void* h_ptr) return isPinned; } -/*! - * \brief Runs GPU sanity checks. - * - * Runs a series of checks to determine that the given GPU and underlying CUDA - * driver/runtime functions properly. - * - * \todo Currently we do not make a distinction between the type of errors - * that can appear during functionality checks. This needs to be improved, - * e.g if the dummy test kernel fails to execute with a "device busy message" - * we should appropriately report that the device is busy instead of NonFunctional. - * - * \todo Introduce errors codes and handle errors more smoothly. - * - * - * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized - * \param[in] dev_prop The device properties structure - * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy - */ -static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop) -{ - cudaError_t cu_err; - int dev_count, id; - - cu_err = cudaGetDeviceCount(&dev_count); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } - - /* no CUDA compatible device at all */ - if (dev_count == 0) - { - return DeviceStatus::NonFunctional; - } - - /* things might go horribly wrong if cudart is not compatible with the driver */ - if (dev_count < 0 || dev_count > cuda_max_device_count) - { - return DeviceStatus::NonFunctional; - } - - if (dev_id == -1) /* device already selected let's not destroy the context */ - { - cu_err = cudaGetDevice(&id); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } - } - else - { - id = dev_id; - if (id > dev_count - 1) /* pfff there's no such device */ - { - fprintf(stderr, - "The requested device with id %d does not seem to exist (device count=%d)\n", - dev_id, dev_count); - return DeviceStatus::NonFunctional; - } - } - - /* both major & minor is 9999 if no CUDA capable devices are present */ - if (dev_prop.major == 9999 && dev_prop.minor == 9999) - { - return DeviceStatus::NonFunctional; - } - /* we don't care about emulation mode */ - if (dev_prop.major == 0) - { - return DeviceStatus::NonFunctional; - } - - if (id != -1) - { - cu_err = cudaSetDevice(id); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id, - cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } - } - - cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop); - // Avoid triggering an error if GPU devices are in exclusive or prohibited mode; - // it is enough to check for cudaErrorDevicesUnavailable only here because - // if we encounter it that will happen in cudaFuncGetAttributes in the above function. - if (cu_err == cudaErrorDevicesUnavailable) - { - return DeviceStatus::Unavailable; - } - else if (cu_err != cudaSuccess) - { - return DeviceStatus::NonFunctional; - } - - /* try to execute a dummy kernel */ - try - { - KernelLaunchConfig config; - config.blockSize[0] = 512; - const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); - DeviceInformation deviceInfo; - const DeviceContext deviceContext(deviceInfo); - const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false); - launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments); - } - catch (gmx::GromacsException& ex) - { - // launchGpuKernel error is not fatal and should continue with marking the device bad - fprintf(stderr, - "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id, - formatExceptionMessageToString(ex).c_str()); - return DeviceStatus::NonFunctional; - } - - if (cudaDeviceSynchronize() != cudaSuccess) - { - return DeviceStatus::NonFunctional; - } - - /* destroy context if we created one */ - if (id != -1) - { - cu_err = cudaDeviceReset(); - CU_RET_ERR(cu_err, "cudaDeviceReset failed"); - } - - return DeviceStatus::Compatible; -} - -void init_gpu(const DeviceInformation* deviceInfo) -{ - cudaError_t stat; - - assert(deviceInfo); - - stat = cudaSetDevice(deviceInfo->id); - if (stat != cudaSuccess) - { - auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id); - CU_RET_ERR(stat, message.c_str()); - } - - if (debug) - { - fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name); - } -} - -void free_gpu(const DeviceInformation* deviceInfo) -{ - // One should only attempt to clear the device context when - // it has been used, but currently the only way to know that a GPU - // device was used is that deviceInfo will be non-null. - if (deviceInfo == nullptr) - { - return; - } - - cudaError_t stat; - - if (debug) - { - int gpuid; - stat = cudaGetDevice(&gpuid); - CU_RET_ERR(stat, "cudaGetDevice failed"); - fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid); - } - - stat = cudaDeviceReset(); - if (stat != cudaSuccess) - { - gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat)); - } -} - -DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId) -{ - if (deviceId < 0 || deviceId >= gpu_info.n_dev) - { - gmx_incons("Invalid GPU deviceId requested"); - } - return &gpu_info.deviceInfo[deviceId]; -} - -/*! \brief Returns true if the gpu characterized by the device properties is - * supported by the native gpu acceleration. - * - * \param[in] dev_prop the CUDA device properties of the gpus to test. - * \returns true if the GPU properties passed indicate a compatible - * GPU, otherwise false. - */ -static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop) -{ - return (dev_prop.major >= 3); -} - -/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. - * - * Returns a status value which indicates compatibility or one of the following - * errors: incompatibility or insanity (=unexpected behavior). - * - * As the error handling only permits returning the state of the GPU, this function - * does not clear the CUDA runtime API status allowing the caller to inspect the error - * upon return. Note that this also means it is the caller's responsibility to - * reset the CUDA runtime state. - * - * \param[in] deviceId the ID of the GPU to check. - * \param[in] deviceProp the CUDA device properties of the device checked. - * \returns the status of the requested device - */ -static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp) -{ - if (!is_gmx_supported_gpu(deviceProp)) - { - return DeviceStatus::Incompatible; - } - return isDeviceFunctional(deviceId, deviceProp); -} - -bool isGpuDetectionFunctional(std::string* errorMessage) -{ - cudaError_t stat; - int driverVersion = -1; - stat = cudaDriverGetVersion(&driverVersion); - GMX_ASSERT(stat != cudaErrorInvalidValue, - "An impossible null pointer was passed to cudaDriverGetVersion"); - GMX_RELEASE_ASSERT( - stat == cudaSuccess, - gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s", - cudaGetErrorName(stat), cudaGetErrorString(stat)) - .c_str()); - bool foundDriver = (driverVersion > 0); - if (!foundDriver) - { - // Can't detect GPUs if there is no driver - if (errorMessage != nullptr) - { - errorMessage->assign("No valid CUDA driver found"); - } - return false; - } - - int numDevices; - stat = cudaGetDeviceCount(&numDevices); - if (stat != cudaSuccess) - { - if (errorMessage != nullptr) - { - /* cudaGetDeviceCount failed which means that there is - * something wrong with the machine: driver-runtime - * mismatch, all GPUs being busy in exclusive mode, - * invalid CUDA_VISIBLE_DEVICES, or some other condition - * which should result in GROMACS issuing at least a - * warning. */ - errorMessage->assign(cudaGetErrorString(stat)); - } - - // Consume the error now that we have prepared to handle - // it. This stops it reappearing next time we check for - // errors. Note that if CUDA_VISIBLE_DEVICES does not contain - // valid devices, then cudaGetLastError returns the - // (undocumented) cudaErrorNoDevice, but this should not be a - // problem as there should be no future CUDA API calls. - // NVIDIA bug report #2038718 has been filed. - cudaGetLastError(); - // Can't detect GPUs - return false; - } - - // We don't actually use numDevices here, that's not the job of - // this function. - return true; -} - -void findGpus(gmx_gpu_info_t* gpu_info) -{ - assert(gpu_info); - - gpu_info->n_dev_compatible = 0; - - int ndev; - cudaError_t stat = cudaGetDeviceCount(&ndev); - if (stat != cudaSuccess) - { - GMX_THROW(gmx::InternalError( - "Invalid call of findGpus() when CUDA API returned an error, perhaps " - "canDetectGpus() was not called appropriately beforehand.")); - } - - // We expect to start device support/sanity checks with a clean runtime error state - gmx::ensureNoPendingCudaError(""); - - DeviceInformation* devs; - snew(devs, ndev); - for (int i = 0; i < ndev; i++) - { - cudaDeviceProp prop; - memset(&prop, 0, sizeof(cudaDeviceProp)); - stat = cudaGetDeviceProperties(&prop, i); - const DeviceStatus checkResult = - (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop); - - devs[i].id = i; - devs[i].prop = prop; - devs[i].stat = checkResult; - - if (checkResult == DeviceStatus::Compatible) - { - gpu_info->n_dev_compatible++; - } - else - { - // TODO: - // - we inspect the CUDA API state to retrieve and record any - // errors that occurred during is_gmx_supported_gpu_id() here, - // but this would be more elegant done within is_gmx_supported_gpu_id() - // and only return a string with the error if one was encountered. - // - we'll be reporting without rank information which is not ideal. - // - we'll end up warning also in cases where users would already - // get an error before mdrun aborts. - // - // Here we also clear the CUDA API error state so potential - // errors during sanity checks don't propagate. - if ((stat = cudaGetLastError()) != cudaSuccess) - { - gmx_warning("An error occurred while sanity checking device #%d; %s: %s", - devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)); - } - } - } - - stat = cudaPeekAtLastError(); - GMX_RELEASE_ASSERT(stat == cudaSuccess, - gmx::formatString("We promise to return with clean CUDA state, but " - "non-success state encountered: %s: %s", - cudaGetErrorName(stat), cudaGetErrorString(stat)) - .c_str()); - - gpu_info->n_dev = ndev; - gpu_info->deviceInfo = devs; -} - -void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index) -{ - assert(s); - - if (index < 0 && index >= gpu_info.n_dev) - { - return; - } - - DeviceInformation* dinfo = &gpu_info.deviceInfo[index]; - - bool bGpuExists = - (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional); - - if (!bGpuExists) - { - sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]); - } - else - { - sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id, - dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor, - dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]); - } -} - -int get_current_cuda_gpu_device_id(void) -{ - int gpuid; - CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed"); - - return gpuid; -} - -size_t sizeof_gpu_dev_info(void) -{ - return sizeof(DeviceInformation); -} - void startGpuProfiler(void) { /* The NVPROF_ID environment variable is set by nvprof and indicates that @@ -561,11 +145,6 @@ void resetGpuProfiler(void) } } -DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index) -{ - return info.deviceInfo[index].stat; -} - /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately * \param[in] stat CUDA call return status * \param[in] gpuA ID for GPU initiating peer access call diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 0e27565c51..fce1e99580 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -54,10 +54,6 @@ #include "gromacs/gpu_utils/gpu_macros.h" #include "gromacs/utility/basedefinitions.h" -struct DeviceInformation; -enum class DeviceStatus : int; -struct gmx_gpu_info_t; - namespace gmx { class MDLogger; @@ -77,156 +73,6 @@ enum class GpuTaskCompletion Check /*<< Only check whether the task has completed */ }; -/*! \brief Return whether GPUs can be detected - * - * Returns true when this is a build of \Gromacs configured to support - * GPU usage, GPU detection is not disabled by an environment variable - * and a valid device driver, ICD, and/or runtime was detected. - * Does not throw. */ -bool canPerformGpuDetection(); - -/*! \brief Return whether GPU detection is functioning correctly - * - * Returns true when this is a build of \Gromacs configured to support - * GPU usage, and a valid device driver, ICD, and/or runtime was detected. - * - * This function is not intended to be called from build - * configurations that do not support GPUs, and there will be no - * descriptive message in that case. - * - * \param[out] errorMessage When returning false on a build configured with - * GPU support and non-nullptr was passed, - * the string contains a descriptive message about - * why GPUs cannot be detected. - * - * Does not throw. */ -GPU_FUNC_QUALIFIER -bool isGpuDetectionFunctional(std::string* GPU_FUNC_ARGUMENT(errorMessage)) - GPU_FUNC_TERM_WITH_RETURN(false); - -/*! \brief Find all GPUs in the system. - * - * Will detect every GPU supported by the device driver in use. - * Must only be called if canPerformGpuDetection() has returned true. - * This routine also checks for the compatibility of each and fill the - * gpu_info->deviceInfo array with the required information on each the - * device: ID, device properties, status. - * - * Note that this function leaves the GPU runtime API error state clean; - * this is implemented ATM in the CUDA flavor. - * TODO: check if errors do propagate in OpenCL as they do in CUDA and - * whether there is a mechanism to "clear" them. - * - * \param[in] gpu_info pointer to structure holding GPU information. - * - * \throws InternalError if a GPU API returns an unexpected failure (because - * the call to canDetectGpus() should always prevent this occuring) - */ -GPU_FUNC_QUALIFIER -void findGpus(gmx_gpu_info_t* GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM; - -/*! \brief Return a container of the detected GPUs that are compatible. - * - * This function filters the result of the detection for compatible - * GPUs, based on the previously run compatibility tests. - * - * \param[in] gpu_info Information detected about GPUs, including compatibility. - * \return vector of IDs of GPUs already recorded as compatible */ -std::vector getCompatibleGpus(const gmx_gpu_info_t& gpu_info); - -/*! \brief Return a string describing how compatible the GPU with given \c index is. - * - * \param[in] gpu_info Information about detected GPUs - * \param[in] index index of GPU to ask about - * \returns A null-terminated C string describing the compatibility status, useful for error messages. - */ -const char* getGpuCompatibilityDescription(const gmx_gpu_info_t& gpu_info, int index); - -/*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info. - * - * \param[in] gpu_info pointer to structure holding GPU information - */ -void free_gpu_info(const gmx_gpu_info_t* gpu_info); - -/*! \brief Initializes the GPU described by \c deviceInfo. - * - * TODO Doxygen complains about these - probably a Doxygen bug, since - * the patterns here are the same as elsewhere in this header. - * - * \param[in] deviceInfo device info of the GPU to initialize - * - * Issues a fatal error for any critical errors that occur during - * initialization. - */ -GPU_FUNC_QUALIFIER -void init_gpu(const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM; - -/*! \brief Frees up the CUDA GPU used by the active context at the time of calling. - * - * If \c deviceInfo is nullptr, then it is understood that no device - * was selected so no context is active to be freed. Otherwise, the - * context is explicitly destroyed and therefore all data uploaded to - * the GPU is lost. This must only be called when none of this data is - * required anymore, because subsequent attempts to free memory - * associated with the context will otherwise fail. - * - * Calls gmx_warning upon errors. - * - * \param[in] deviceInfo device info of the GPU to clean up for - * - * \returns true if no error occurs during the freeing. - */ -CUDA_FUNC_QUALIFIER -void free_gpu(const DeviceInformation* CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM; - -/*! \brief Return a pointer to the device info for \c deviceId - * - * \param[in] gpu_info GPU info of all detected devices in the system. - * \param[in] deviceId ID for the GPU device requested. - * - * \returns Pointer to the device info for \c deviceId. - */ -GPU_FUNC_QUALIFIER -DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& GPU_FUNC_ARGUMENT(gpu_info), - int GPU_FUNC_ARGUMENT(deviceId)) GPU_FUNC_TERM_WITH_RETURN(nullptr); - -/*! \brief Returns the device ID of the CUDA GPU currently in use. - * - * The GPU used is the one that is active at the time of the call in the active context. - * - * \returns device ID of the GPU in use at the time of the call - */ -CUDA_FUNC_QUALIFIER -int get_current_cuda_gpu_device_id() CUDA_FUNC_TERM_WITH_RETURN(-1); - -/*! \brief Formats and returns a device information string for a given GPU. - * - * Given an index *directly* into the array of available GPUs (gpu_dev) - * returns a formatted info string for the respective GPU which includes - * ID, name, compute capability, and detection status. - * - * \param[out] s pointer to output string (has to be allocated externally) - * \param[in] gpu_info Information about detected GPUs - * \param[in] index an index *directly* into the array of available GPUs - */ -GPU_FUNC_QUALIFIER -void get_gpu_device_info_string(char* GPU_FUNC_ARGUMENT(s), - const gmx_gpu_info_t& GPU_FUNC_ARGUMENT(gpu_info), - int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM; - - -/*! \brief Returns the size of the gpu_dev_info struct. - * - * The size of gpu_dev_info can be used for allocation and communication. - * - * \returns size in bytes of gpu_dev_info - */ -GPU_FUNC_QUALIFIER -size_t sizeof_gpu_dev_info() GPU_FUNC_TERM_WITH_RETURN(0); - -//! Get status of device with specified index -DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index); - /*! \brief Check if GROMACS has been built with GPU support. * * \param[in] error Pointer to error string or nullptr. diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 98fd8d04ef..a165df595d 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -51,22 +51,6 @@ //! Device texture for fast read-only data fetching using DeviceTexture = cudaTextureObject_t; -/*! \brief CUDA device information. - * - * The CUDA device information is queried and set at detection and contains - * both information about the device/hardware returned by the runtime as well - * as additional data like support status. - */ -struct DeviceInformation -{ - //! ID of the CUDA device. - int id; - //! CUDA device properties. - cudaDeviceProp prop; - //! Device status. - DeviceStatus stat; -}; - //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index e3a3a9275e..79ccaa64aa 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -59,12 +59,6 @@ using DeviceTexture = void*; -//! \internal Stub for device information. -struct DeviceInformation -{ - // No member needed -}; - //! \brief Single GPU call timing event using CommandEvent = void*; diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index a3eb510c95..b3c6c8340e 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -50,38 +50,6 @@ using DeviceTexture = void*; -//! OpenCL device vendors -enum class DeviceVendor : int -{ - Unknown = 0, //!< No data - Nvidia = 1, //!< NVIDIA - Amd = 2, //!< Advanced Micro Devices - Intel = 3, //!< Intel - Count = 4 -}; - -/*! \internal - * \brief OpenCL device information. - * - * The OpenCL device information is queried and set at detection and contains - * both information about the device/hardware returned by the runtime as well - * as additional data like support status. - */ -struct DeviceInformation -{ - cl_platform_id oclPlatformId; //!< OpenCL Platform ID. - cl_device_id oclDeviceId; //!< OpenCL Device ID. - char device_name[256]; //!< Device name. - char device_version[256]; //!< Device version. - char vendorName[256]; //!< Device vendor name. - int compute_units; //!< Number of compute units. - int adress_bits; //!< Number of address bits the device is capable of. - DeviceStatus stat; //!< Device status. - DeviceVendor deviceVendor; //!< Device vendor. - size_t maxWorkItemSizes[3]; //!< Workgroup size limits (CL_DEVICE_MAX_WORK_ITEM_SIZES). - size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE). -}; - //! \brief Single GPU call timing event using CommandEvent = cl_event; diff --git a/src/gromacs/gpu_utils/ocl_compiler.h b/src/gromacs/gpu_utils/ocl_compiler.h index 4cd381865e..00baec0f06 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.h +++ b/src/gromacs/gpu_utils/ocl_compiler.h @@ -48,6 +48,7 @@ #include #include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/hardware/device_information.h" namespace gmx { diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.cu b/src/gromacs/gpu_utils/tests/devicetransfers.cu index e3a56be9c3..0636285a1e 100644 --- a/src/gromacs/gpu_utils/tests/devicetransfers.cu +++ b/src/gromacs/gpu_utils/tests/devicetransfers.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -49,7 +49,7 @@ #include "devicetransfers.h" #include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" diff --git a/src/gromacs/gpu_utils/tests/devicetransfers_ocl.cpp b/src/gromacs/gpu_utils/tests/devicetransfers_ocl.cpp index ffe60c00e9..8338e58fa8 100644 --- a/src/gromacs/gpu_utils/tests/devicetransfers_ocl.cpp +++ b/src/gromacs/gpu_utils/tests/devicetransfers_ocl.cpp @@ -41,8 +41,8 @@ #include "gmxpre.h" #include "gromacs/gpu_utils/gmxopencl.h" -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" diff --git a/src/gromacs/gpu_utils/tests/gputest.cpp b/src/gromacs/gpu_utils/tests/gputest.cpp index a862361586..4caabc374f 100644 --- a/src/gromacs/gpu_utils/tests/gputest.cpp +++ b/src/gromacs/gpu_utils/tests/gputest.cpp @@ -44,7 +44,7 @@ #include -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/smalloc.h" diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index 1aedf1a166..0a4134bd9c 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -47,6 +47,7 @@ #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/hardware/device_information.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/stringutil.h" diff --git a/src/gromacs/hardware/CMakeLists.txt b/src/gromacs/hardware/CMakeLists.txt index efc684c31f..c93767f790 100644 --- a/src/gromacs/hardware/CMakeLists.txt +++ b/src/gromacs/hardware/CMakeLists.txt @@ -35,11 +35,26 @@ gmx_add_libgromacs_sources( cpuinfo.cpp detecthardware.cpp + device_management_common.cpp hardwaretopology.cpp printhardware.cpp identifyavx512fmaunits.cpp ) +if(GMX_GPU_OPENCL) + gmx_add_libgromacs_sources( + device_management_ocl.cpp + ) +elseif(GMX_GPU_CUDA) + gmx_add_libgromacs_sources( + device_management.cu + ) +else() + gmx_add_libgromacs_sources( + device_management.cpp + ) +endif() + if (BUILD_TESTING) add_subdirectory(tests) endif() diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp index e635ca0b17..7e8ac92c24 100644 --- a/src/gromacs/hardware/detecthardware.cpp +++ b/src/gromacs/hardware/detecthardware.cpp @@ -48,8 +48,8 @@ #include #include "gromacs/compat/pointers.h" -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hardwaretopology.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/simd/support.h" diff --git a/src/gromacs/hardware/device_information.h b/src/gromacs/hardware/device_information.h new file mode 100644 index 0000000000..d9116a3a72 --- /dev/null +++ b/src/gromacs/hardware/device_information.h @@ -0,0 +1,148 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Declares the GPU type traits for non-GPU builds. + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \inlibraryapi + * \ingroup module_hardware + */ +#ifndef GMX_HARDWARE_DEVICE_INFORMATION_H +#define GMX_HARDWARE_DEVICE_INFORMATION_H + +#include "config.h" + +#if GMX_GPU_CUDA +# include +#endif + +#if GMX_GPU_OPENCL +# include "gromacs/gpu_utils/gmxopencl.h" +#endif +#include "gromacs/utility/enumerationhelpers.h" + +//! Constant used to help minimize preprocessed code +static constexpr bool c_binarySupportsGpus = (GMX_GPU != 0); + +//! Possible results of the GPU detection/check. +enum class DeviceStatus : int +{ + //! The device is compatible + Compatible = 0, + //! Device does not exist + Nonexistent = 1, + //! Device is not compatible + Incompatible = 2, + //! OpenCL device has incompatible cluster size for non-bonded kernels. + IncompatibleClusterSize = 3, + /*! \brief An error occurred he functionality checks. + * That indicates malfunctioning of the device, driver, or incompatible driver/runtime. + */ + NonFunctional = 4, + /*! \brief CUDA devices are busy or unavailable. + * typically due to use of \p cudaComputeModeExclusive, \p cudaComputeModeProhibited modes. + */ + Unavailable = 5, + //! Enumeration size + Count = 6 +}; + +/*! \brief Names of the GPU detection/check results + * + * Check-source wants to warn about the use of a symbol name that would + * require an inclusion of config.h. However the use is in a comment, so that + * is a false warning. So C-style string concatenation is used to fool the + * naive parser in check-source. That needs a clang-format suppression + * in order to look reasonable. Also clang-tidy wants to suggest that a comma is + * missing, so that is suppressed. + */ +static const gmx::EnumerationArray c_deviceStateString = { + "compatible", "nonexistent", "incompatible", + // clang-format off + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "incompatible (please recompile with correct GMX" "_OPENCL_NB_CLUSTER_SIZE of 4)", + // clang-format on + "non-functional", "unavailable" +}; + +//! Device vendors +enum class DeviceVendor : int +{ + //! No data + Unknown = 0, + //! NVIDIA + Nvidia = 1, + //! Advanced Micro Devices + Amd = 2, + //! Intel + Intel = 3, + //! Enumeration size + Count = 4 +}; + + +/*! \libinternal \brief Platform-dependent device information. + * + * The device information is queried and set at detection and contains + * both information about the device/hardware returned by the runtime as well + * as additional data like support status. + */ +struct DeviceInformation +{ + //! Device status. + DeviceStatus stat; + //! ID of the device. + int id; + +#if GMX_GPU_CUDA + //! CUDA device properties. + cudaDeviceProp prop; +#elif GMX_GPU_OPENCL + cl_platform_id oclPlatformId; //!< OpenCL Platform ID. + cl_device_id oclDeviceId; //!< OpenCL Device ID. + char device_name[256]; //!< Device name. + char device_version[256]; //!< Device version. + char vendorName[256]; //!< Device vendor name. + int compute_units; //!< Number of compute units. + int adress_bits; //!< Number of address bits the device is capable of. + DeviceVendor deviceVendor; //!< Device vendor. + size_t maxWorkItemSizes[3]; //!< Workgroup size limits (CL_DEVICE_MAX_WORK_ITEM_SIZES). + size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE). +#endif +}; + +#endif // GMX_HARDWARE_DEVICE_INFORMATION_H diff --git a/src/gromacs/hardware/device_management.cpp b/src/gromacs/hardware/device_management.cpp new file mode 100644 index 0000000000..1d03f1b0be --- /dev/null +++ b/src/gromacs/hardware/device_management.cpp @@ -0,0 +1,90 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team. + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Defines the CPU stubs for the device management. + * + * \author Artem Zhmurov + * + * \ingroup module_hardware + */ +#include "gmxpre.h" + +#include "device_management.h" + +bool isGpuDetectionFunctional(std::string* errorMessage) +{ + if (errorMessage != nullptr) + { + errorMessage->assign("GROMACS has been built without GPU support."); + } + return false; +} + +void findGpus(gmx_gpu_info_t* /* gpu_info */) +{ + GMX_RELEASE_ASSERT(false, "Trying to initialize GPUs in the build that does not support them."); +} + +void init_gpu(const DeviceInformation* /* deviceInfo */) +{ + GMX_RELEASE_ASSERT(false, "Trying to initialize GPU in the build that does not support GPUs."); +} + +void free_gpu(const DeviceInformation* /* deviceInfo */) {} + +DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& /* gpu_info */, int /* deviceId */) +{ + GMX_RELEASE_ASSERT( + false, "Trying to get GPU device information in the build that does not support GPUs."); + return nullptr; +} + +void get_gpu_device_info_string(char* /* s */, const gmx_gpu_info_t& /* gpu_info */, int /* index */) +{ + GMX_RELEASE_ASSERT( + false, + "Trying to get the GPU device description in the build that does not support GPUs."); +} + +size_t sizeof_gpu_dev_info() +{ + return 0; +} + +DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& /* gpu_info */, int /* index */) +{ + return DeviceStatus::Nonexistent; +} diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu new file mode 100644 index 0000000000..fba12ace11 --- /dev/null +++ b/src/gromacs/hardware/device_management.cu @@ -0,0 +1,466 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team. + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Defines the CUDA implementations of the device management. + * + * \author Artem Zhmurov + * + * \ingroup module_hardware + */ +#include "gmxpre.h" + +#include "device_management.h" + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" + +/*! \internal \brief + * Max number of devices supported by CUDA (for consistency checking). + * + * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. + */ +static int cuda_max_device_count = 32; + +/** Dummy kernel used for sanity checking. */ +static __global__ void k_dummy_test(void) {} + +static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp) +{ + cudaFuncAttributes attributes; + cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test); + + if (cudaErrorInvalidDeviceFunction == stat) + { + fprintf(stderr, + "\nWARNING: The %s binary does not include support for the CUDA architecture of " + "the GPU ID #%d (compute capability %d.%d) detected during detection. " + "By default, GROMACS supports all architectures of compute " + "capability >= 3.0, so your GPU " + "might be rare, or some architectures were disabled in the build. \n" + "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and " + "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n", + gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor); + } + + return stat; +} + +/*! + * \brief Runs GPU sanity checks. + * + * Runs a series of checks to determine that the given GPU and underlying CUDA + * driver/runtime functions properly. + * + * \todo Currently we do not make a distinction between the type of errors + * that can appear during functionality checks. This needs to be improved, + * e.g if the dummy test kernel fails to execute with a "device busy message" + * we should appropriately report that the device is busy instead of NonFunctional. + * + * \todo Introduce errors codes and handle errors more smoothly. + * + * + * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized + * \param[in] dev_prop The device properties structure + * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy + */ +static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop) +{ + cudaError_t cu_err; + int dev_count, id; + + cu_err = cudaGetDeviceCount(&dev_count); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err)); + return DeviceStatus::NonFunctional; + } + + /* no CUDA compatible device at all */ + if (dev_count == 0) + { + return DeviceStatus::NonFunctional; + } + + /* things might go horribly wrong if cudart is not compatible with the driver */ + if (dev_count < 0 || dev_count > cuda_max_device_count) + { + return DeviceStatus::NonFunctional; + } + + if (dev_id == -1) /* device already selected let's not destroy the context */ + { + cu_err = cudaGetDevice(&id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err)); + return DeviceStatus::NonFunctional; + } + } + else + { + id = dev_id; + if (id > dev_count - 1) /* pfff there's no such device */ + { + fprintf(stderr, + "The requested device with id %d does not seem to exist (device count=%d)\n", + dev_id, dev_count); + return DeviceStatus::NonFunctional; + } + } + + /* both major & minor is 9999 if no CUDA capable devices are present */ + if (dev_prop.major == 9999 && dev_prop.minor == 9999) + { + return DeviceStatus::NonFunctional; + } + /* we don't care about emulation mode */ + if (dev_prop.major == 0) + { + return DeviceStatus::NonFunctional; + } + + if (id != -1) + { + cu_err = cudaSetDevice(id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id, + cudaGetErrorString(cu_err)); + return DeviceStatus::NonFunctional; + } + } + + cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop); + // Avoid triggering an error if GPU devices are in exclusive or prohibited mode; + // it is enough to check for cudaErrorDevicesUnavailable only here because + // if we encounter it that will happen in cudaFuncGetAttributes in the above function. + if (cu_err == cudaErrorDevicesUnavailable) + { + return DeviceStatus::Unavailable; + } + else if (cu_err != cudaSuccess) + { + return DeviceStatus::NonFunctional; + } + + /* try to execute a dummy kernel */ + try + { + KernelLaunchConfig config; + config.blockSize[0] = 512; + const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); + DeviceInformation deviceInfo; + const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false); + launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments); + } + catch (gmx::GromacsException& ex) + { + // launchGpuKernel error is not fatal and should continue with marking the device bad + fprintf(stderr, + "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id, + formatExceptionMessageToString(ex).c_str()); + return DeviceStatus::NonFunctional; + } + + if (cudaDeviceSynchronize() != cudaSuccess) + { + return DeviceStatus::NonFunctional; + } + + /* destroy context if we created one */ + if (id != -1) + { + cu_err = cudaDeviceReset(); + CU_RET_ERR(cu_err, "cudaDeviceReset failed"); + } + + return DeviceStatus::Compatible; +} + +/*! \brief Returns true if the gpu characterized by the device properties is + * supported by the native gpu acceleration. + * + * \param[in] dev_prop the CUDA device properties of the gpus to test. + * \returns true if the GPU properties passed indicate a compatible + * GPU, otherwise false. + */ +static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop) +{ + return (dev_prop.major >= 3); +} + +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. + * + * Returns a status value which indicates compatibility or one of the following + * errors: incompatibility or insanity (=unexpected behavior). + * + * As the error handling only permits returning the state of the GPU, this function + * does not clear the CUDA runtime API status allowing the caller to inspect the error + * upon return. Note that this also means it is the caller's responsibility to + * reset the CUDA runtime state. + * + * \param[in] deviceId the ID of the GPU to check. + * \param[in] deviceProp the CUDA device properties of the device checked. + * \returns the status of the requested device + */ +static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp) +{ + if (!is_gmx_supported_gpu(deviceProp)) + { + return DeviceStatus::Incompatible; + } + return isDeviceFunctional(deviceId, deviceProp); +} + +bool isGpuDetectionFunctional(std::string* errorMessage) +{ + cudaError_t stat; + int driverVersion = -1; + stat = cudaDriverGetVersion(&driverVersion); + GMX_ASSERT(stat != cudaErrorInvalidValue, + "An impossible null pointer was passed to cudaDriverGetVersion"); + GMX_RELEASE_ASSERT( + stat == cudaSuccess, + gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s", + cudaGetErrorName(stat), cudaGetErrorString(stat)) + .c_str()); + bool foundDriver = (driverVersion > 0); + if (!foundDriver) + { + // Can't detect GPUs if there is no driver + if (errorMessage != nullptr) + { + errorMessage->assign("No valid CUDA driver found"); + } + return false; + } + + int numDevices; + stat = cudaGetDeviceCount(&numDevices); + if (stat != cudaSuccess) + { + if (errorMessage != nullptr) + { + /* cudaGetDeviceCount failed which means that there is + * something wrong with the machine: driver-runtime + * mismatch, all GPUs being busy in exclusive mode, + * invalid CUDA_VISIBLE_DEVICES, or some other condition + * which should result in GROMACS issuing at least a + * warning. */ + errorMessage->assign(cudaGetErrorString(stat)); + } + + // Consume the error now that we have prepared to handle + // it. This stops it reappearing next time we check for + // errors. Note that if CUDA_VISIBLE_DEVICES does not contain + // valid devices, then cudaGetLastError returns the + // (undocumented) cudaErrorNoDevice, but this should not be a + // problem as there should be no future CUDA API calls. + // NVIDIA bug report #2038718 has been filed. + cudaGetLastError(); + // Can't detect GPUs + return false; + } + + // We don't actually use numDevices here, that's not the job of + // this function. + return true; +} + +void findGpus(gmx_gpu_info_t* gpu_info) +{ + assert(gpu_info); + + gpu_info->n_dev_compatible = 0; + + int ndev; + cudaError_t stat = cudaGetDeviceCount(&ndev); + if (stat != cudaSuccess) + { + GMX_THROW(gmx::InternalError( + "Invalid call of findGpus() when CUDA API returned an error, perhaps " + "canDetectGpus() was not called appropriately beforehand.")); + } + + // We expect to start device support/sanity checks with a clean runtime error state + gmx::ensureNoPendingCudaError(""); + + DeviceInformation* devs; + snew(devs, ndev); + for (int i = 0; i < ndev; i++) + { + cudaDeviceProp prop; + memset(&prop, 0, sizeof(cudaDeviceProp)); + stat = cudaGetDeviceProperties(&prop, i); + const DeviceStatus checkResult = + (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop); + + devs[i].id = i; + devs[i].prop = prop; + devs[i].stat = checkResult; + + if (checkResult == DeviceStatus::Compatible) + { + gpu_info->n_dev_compatible++; + } + else + { + // TODO: + // - we inspect the CUDA API state to retrieve and record any + // errors that occurred during is_gmx_supported_gpu_id() here, + // but this would be more elegant done within is_gmx_supported_gpu_id() + // and only return a string with the error if one was encountered. + // - we'll be reporting without rank information which is not ideal. + // - we'll end up warning also in cases where users would already + // get an error before mdrun aborts. + // + // Here we also clear the CUDA API error state so potential + // errors during sanity checks don't propagate. + if ((stat = cudaGetLastError()) != cudaSuccess) + { + gmx_warning("An error occurred while sanity checking device #%d; %s: %s", + devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)); + } + } + } + + stat = cudaPeekAtLastError(); + GMX_RELEASE_ASSERT(stat == cudaSuccess, + gmx::formatString("We promise to return with clean CUDA state, but " + "non-success state encountered: %s: %s", + cudaGetErrorName(stat), cudaGetErrorString(stat)) + .c_str()); + + gpu_info->n_dev = ndev; + gpu_info->deviceInfo = devs; +} + +void init_gpu(const DeviceInformation* deviceInfo) +{ + cudaError_t stat; + + assert(deviceInfo); + + stat = cudaSetDevice(deviceInfo->id); + if (stat != cudaSuccess) + { + auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id); + CU_RET_ERR(stat, message.c_str()); + } + + if (debug) + { + fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name); + } +} + +void free_gpu(const DeviceInformation* deviceInfo) +{ + // One should only attempt to clear the device context when + // it has been used, but currently the only way to know that a GPU + // device was used is that deviceInfo will be non-null. + if (deviceInfo == nullptr) + { + return; + } + + cudaError_t stat; + + if (debug) + { + int gpuid; + stat = cudaGetDevice(&gpuid); + CU_RET_ERR(stat, "cudaGetDevice failed"); + fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid); + } + + stat = cudaDeviceReset(); + if (stat != cudaSuccess) + { + gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat)); + } +} + +DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId) +{ + if (deviceId < 0 || deviceId >= gpu_info.n_dev) + { + gmx_incons("Invalid GPU deviceId requested"); + } + return &gpu_info.deviceInfo[deviceId]; +} + +void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index) +{ + assert(s); + + if (index < 0 && index >= gpu_info.n_dev) + { + return; + } + + DeviceInformation* dinfo = &gpu_info.deviceInfo[index]; + + bool bGpuExists = + (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional); + + if (!bGpuExists) + { + sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]); + } + else + { + sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id, + dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor, + dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]); + } +} + +size_t sizeof_gpu_dev_info(void) +{ + return sizeof(DeviceInformation); +} + +DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index) +{ + return info.deviceInfo[index].stat; +} diff --git a/src/gromacs/hardware/device_management.h b/src/gromacs/hardware/device_management.h new file mode 100644 index 0000000000..ed86982c67 --- /dev/null +++ b/src/gromacs/hardware/device_management.h @@ -0,0 +1,188 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief Implements the device management for OpenCL. + * + * \author Artem Zhmurov + * + * \inlibraryapi + * \ingroup module_hardware + */ +#ifndef GMX_HARDWARE_DEVICE_MANAGEMENT_H +#define GMX_HARDWARE_DEVICE_MANAGEMENT_H + +#include "gmxpre.h" + +#include +#include + +#include "gromacs/hardware/device_information.h" + +struct DeviceInformation; +enum class DeviceStatus : int; +struct gmx_gpu_info_t; + +/*! \brief Return whether GPUs can be detected + * + * Returns true when this is a build of \Gromacs configured to support + * GPU usage, GPU detection is not disabled by an environment variable + * and a valid device driver, ICD, and/or runtime was detected. + * Does not throw. */ +bool canPerformGpuDetection(); + +/*! \brief Return whether GPU detection is functioning correctly + * + * Returns true when this is a build of \Gromacs configured to support + * GPU usage, and a valid device driver, ICD, and/or runtime was detected. + * + * This function is not intended to be called from build + * configurations that do not support GPUs, and there will be no + * descriptive message in that case. + * + * \param[out] errorMessage When returning false on a build configured with + * GPU support and non-nullptr was passed, + * the string contains a descriptive message about + * why GPUs cannot be detected. + * + * Does not throw. */ +bool isGpuDetectionFunctional(std::string* errorMessage); + +/*! \brief Find all GPUs in the system. + * + * Will detect every GPU supported by the device driver in use. + * Must only be called if canPerformGpuDetection() has returned true. + * This routine also checks for the compatibility of each and fill the + * gpu_info->deviceInfo array with the required information on each the + * device: ID, device properties, status. + * + * Note that this function leaves the GPU runtime API error state clean; + * this is implemented ATM in the CUDA flavor. + * TODO: check if errors do propagate in OpenCL as they do in CUDA and + * whether there is a mechanism to "clear" them. + * + * \param[in] gpu_info pointer to structure holding GPU information. + * + * \throws InternalError if a GPU API returns an unexpected failure (because + * the call to canDetectGpus() should always prevent this occuring) + */ +void findGpus(gmx_gpu_info_t* gpu_info); + +/*! \brief Return a container of the detected GPUs that are compatible. + * + * This function filters the result of the detection for compatible + * GPUs, based on the previously run compatibility tests. + * + * \param[in] gpu_info Information detected about GPUs, including compatibility. + * \return vector of IDs of GPUs already recorded as compatible */ +std::vector getCompatibleGpus(const gmx_gpu_info_t& gpu_info); + +/*! \brief Return a string describing how compatible the GPU with given \c index is. + * + * \param[in] gpu_info Information about detected GPUs + * \param[in] index index of GPU to ask about + * \returns A null-terminated C string describing the compatibility status, useful for error messages. + */ +const char* getGpuCompatibilityDescription(const gmx_gpu_info_t& gpu_info, int index); + +/*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info. + * + * \param[in] gpu_info pointer to structure holding GPU information + */ +void free_gpu_info(const gmx_gpu_info_t* gpu_info); + +/*! \brief Initializes the GPU described by \c deviceInfo. + * + * TODO Doxygen complains about these - probably a Doxygen bug, since + * the patterns here are the same as elsewhere in this header. + * + * \param[in] deviceInfo device info of the GPU to initialize + * + * Issues a fatal error for any critical errors that occur during + * initialization. + */ +void init_gpu(const DeviceInformation* deviceInfo); + +/*! \brief Frees up the CUDA GPU used by the active context at the time of calling. + * + * If \c deviceInfo is nullptr, then it is understood that no device + * was selected so no context is active to be freed. Otherwise, the + * context is explicitly destroyed and therefore all data uploaded to + * the GPU is lost. This must only be called when none of this data is + * required anymore, because subsequent attempts to free memory + * associated with the context will otherwise fail. + * + * Calls gmx_warning upon errors. + * + * \param[in] deviceInfo device info of the GPU to clean up for + * + * \returns true if no error occurs during the freeing. + */ +void free_gpu(const DeviceInformation* deviceInfo); + +/*! \brief Return a pointer to the device info for \c deviceId + * + * \param[in] gpu_info GPU info of all detected devices in the system. + * \param[in] deviceId ID for the GPU device requested. + * + * \returns Pointer to the device info for \c deviceId. + */ +DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId); + +/*! \brief Formats and returns a device information string for a given GPU. + * + * Given an index *directly* into the array of available GPUs (gpu_dev) + * returns a formatted info string for the respective GPU which includes + * ID, name, compute capability, and detection status. + * + * \param[out] s pointer to output string (has to be allocated externally) + * \param[in] gpu_info Information about detected GPUs + * \param[in] index an index *directly* into the array of available GPUs + */ +void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index); + + +/*! \brief Returns the size of the gpu_dev_info struct. + * + * The size of gpu_dev_info can be used for allocation and communication. + * + * \returns size in bytes of gpu_dev_info + */ +size_t sizeof_gpu_dev_info(); + +//! Get status of device with specified index +DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index); + +#endif // GMX_HARDWARE_DEVICE_MANAGEMENT_H diff --git a/src/gromacs/hardware/device_management_common.cpp b/src/gromacs/hardware/device_management_common.cpp new file mode 100644 index 0000000000..d5325b77e0 --- /dev/null +++ b/src/gromacs/hardware/device_management_common.cpp @@ -0,0 +1,89 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team. + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Defines the implementations of the device management that are common for CPU, CUDA and OpenCL. + * + * \author Artem Zhmurov + * + * \ingroup module_hardware + */ +#include "gmxpre.h" + +#include + +#include "gromacs/hardware/device_information.h" +#include "gromacs/hardware/device_management.h" +#include "gromacs/hardware/gpu_hw_info.h" +#include "gromacs/utility/smalloc.h" + +bool canPerformGpuDetection() +{ + if (c_binarySupportsGpus && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr) + { + return isGpuDetectionFunctional(nullptr); + } + else + { + return false; + } +} + +std::vector getCompatibleGpus(const gmx_gpu_info_t& gpu_info) +{ + // Possible minor over-allocation here, but not important for anything + std::vector compatibleGpus; + compatibleGpus.reserve(gpu_info.n_dev); + for (int i = 0; i < gpu_info.n_dev; i++) + { + assert(gpu_info.deviceInfo); + if (gpu_info_get_stat(gpu_info, i) == DeviceStatus::Compatible) + { + compatibleGpus.push_back(i); + } + } + return compatibleGpus; +} + +const char* getGpuCompatibilityDescription(const gmx_gpu_info_t& gpu_info, int index) +{ + return (index >= gpu_info.n_dev ? c_deviceStateString[DeviceStatus::Nonexistent] + : c_deviceStateString[gpu_info_get_stat(gpu_info, index)]); +} + +void free_gpu_info(const gmx_gpu_info_t* gpu_info) +{ + sfree(static_cast(gpu_info->deviceInfo)); // circumvent is_pod check in sfree +} diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/hardware/device_management_ocl.cpp similarity index 99% rename from src/gromacs/gpu_utils/gpu_utils_ocl.cpp rename to src/gromacs/hardware/device_management_ocl.cpp index 9f78bbda6c..3cf2eec706 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/hardware/device_management_ocl.cpp @@ -59,10 +59,11 @@ #include -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/ocl_compiler.h" #include "gromacs/gpu_utils/oclraii.h" #include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/hardware/device_information.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/exceptions.h" @@ -518,32 +519,6 @@ void findGpus(gmx_gpu_info_t* gpu_info) sfree(ocl_platform_ids); } -void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index) -{ - assert(s); - - if (index < 0 && index >= gpu_info.n_dev) - { - return; - } - - DeviceInformation* dinfo = &gpu_info.deviceInfo[index]; - - bool bGpuExists = - (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional); - - if (!bGpuExists) - { - sprintf(s, "#%d: %s, stat: %s", index, "N/A", c_deviceStateString[dinfo->stat]); - } - else - { - sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s", index, dinfo->device_name, - dinfo->vendorName, dinfo->device_version, c_deviceStateString[dinfo->stat]); - } -} - - void init_gpu(const DeviceInformation* deviceInfo) { assert(deviceInfo); @@ -566,6 +541,8 @@ void init_gpu(const DeviceInformation* deviceInfo) } } +void free_gpu(const DeviceInformation* /* deviceInfo */) {} + DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId) { if (deviceId < 0 || deviceId >= gpu_info.n_dev) @@ -575,6 +552,31 @@ DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId) return &gpu_info.deviceInfo[deviceId]; } +void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index) +{ + assert(s); + + if (index < 0 && index >= gpu_info.n_dev) + { + return; + } + + DeviceInformation* dinfo = &gpu_info.deviceInfo[index]; + + bool bGpuExists = + (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional); + + if (!bGpuExists) + { + sprintf(s, "#%d: %s, stat: %s", index, "N/A", c_deviceStateString[dinfo->stat]); + } + else + { + sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s", index, dinfo->device_name, + dinfo->vendorName, dinfo->device_version, c_deviceStateString[dinfo->stat]); + } +} + size_t sizeof_gpu_dev_info() { return sizeof(DeviceInformation); diff --git a/src/gromacs/hardware/gpu_hw_info.h b/src/gromacs/hardware/gpu_hw_info.h index 125a3a9eed..ff114d1248 100644 --- a/src/gromacs/hardware/gpu_hw_info.h +++ b/src/gromacs/hardware/gpu_hw_info.h @@ -41,47 +41,6 @@ struct DeviceInformation; -//! Possible results of the GPU detection/check. -enum class DeviceStatus : int -{ - //! The device is compatible - Compatible = 0, - //! Device does not exist - Nonexistent = 1, - //! Device is not compatible - Incompatible = 2, - //! OpenCL device has incompatible cluster size for non-bonded kernels. - IncompatibleClusterSize = 3, - /*! \brief An error occurred he functionality checks. - * That indicates malfunctioning of the device, driver, or incompatible driver/runtime. - */ - NonFunctional = 4, - /*! \brief CUDA devices are busy or unavailable. - * typically due to use of \p cudaComputeModeExclusive, \p cudaComputeModeProhibited modes. - */ - Unavailable = 5, - //! Enumeration size - Count = 6 -}; - -/*! \brief Names of the GPU detection/check results - * - * Check-source wants to warn about the use of a symbol name that would - * require an inclusion of config.h. However the use is in a comment, so that - * is a false warning. So C-style string concatenation is used to fool the - * naive parser in check-source. That needs a clang-format suppression - * in order to look reasonable. Also clang-tidy wants to suggest that a comma is - * missing, so that is suppressed. - */ -static const gmx::EnumerationArray c_deviceStateString = { - "compatible", "nonexistent", "incompatible", - // clang-format off - // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) - "incompatible (please recompile with correct GMX" "_OPENCL_NB_CLUSTER_SIZE of 4)", - // clang-format on - "non-functional", "unavailable" -}; - /*! \brief Information about GPU devices on this physical node. * * Includes either CUDA or OpenCL devices. The gmx_hardware_detect diff --git a/src/gromacs/hardware/printhardware.cpp b/src/gromacs/hardware/printhardware.cpp index 4283441c0e..b7af58092b 100644 --- a/src/gromacs/hardware/printhardware.cpp +++ b/src/gromacs/hardware/printhardware.cpp @@ -44,8 +44,8 @@ #include #include -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hardwaretopology.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/hardware/identifyavx512fmaunits.h" diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index 161c63b5d5..6b97a80649 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -52,7 +52,7 @@ #include #include "gromacs/gpu_utils/devicebuffer.cuh" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/mdlib/lincs_gpu.cuh" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/unique_cptr.h" diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu index f895816ad2..7f9a5766a3 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu @@ -53,7 +53,7 @@ #include #include "gromacs/gpu_utils/devicebuffer.cuh" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/math/vec.h" #include "gromacs/mdlib/leapfrog_gpu.cuh" #include "gromacs/mdlib/stat.h" diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu index f9cf9867f3..6bbf8eb5e3 100644 --- a/src/gromacs/mdlib/tests/settletestrunners.cu +++ b/src/gromacs/mdlib/tests/settletestrunners.cu @@ -52,7 +52,7 @@ #include #include "gromacs/gpu_utils/devicebuffer.cuh" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/utility/unique_cptr.h" diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index c2664db220..37388e1e6e 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -75,9 +75,9 @@ #include "gromacs/gmxlib/nrnb.h" #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream_manager.h" -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/cpuinfo.h" #include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/printhardware.h" #include "gromacs/imd/imd.h" #include "gromacs/listed_forces/disre.h" diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 71e598fdf1..5f41fda5ef 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -58,6 +58,7 @@ #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/hardware/device_information.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/nbnxm/atomdata.h" #include "gromacs/nbnxm/gpu_common.h" diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index f5d64d7d83..cc1b6f37ea 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -55,7 +55,7 @@ #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/pmalloc_cuda.h" -#include "gromacs/hardware/gpu_hw_info.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/math/vectypes.h" #include "gromacs/mdlib/force_flags.h" #include "gromacs/mdtypes/interaction_const.h" diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index eaa7bfec4b..1744e9890c 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -73,6 +73,7 @@ #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/nbnxm/atomdata.h" diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 7d74ebac4c..58d9624e17 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -53,8 +53,8 @@ #include #include "gromacs/gpu_utils/device_stream_manager.h" -#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/hardware/device_information.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/math/vectypes.h" #include "gromacs/mdlib/force_flags.h" diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp index 8e0ace91cf..1688e69293 100644 --- a/src/gromacs/taskassignment/taskassignment.cpp +++ b/src/gromacs/taskassignment/taskassignment.cpp @@ -60,7 +60,7 @@ #include "gromacs/domdec/domdec.h" #include "gromacs/gmxlib/network.h" -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/mdrunutility/multisim.h" #include "gromacs/mdtypes/commrec.h" diff --git a/src/gromacs/taskassignment/usergpuids.cpp b/src/gromacs/taskassignment/usergpuids.cpp index a370e506f7..275dbfd43b 100644 --- a/src/gromacs/taskassignment/usergpuids.cpp +++ b/src/gromacs/taskassignment/usergpuids.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -49,7 +49,7 @@ #include #include -#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/hardware/hw_info.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/stringutil.h" -- 2.22.0