From: Artem Zhmurov Date: Thu, 17 Sep 2020 12:49:10 +0000 (+0000) Subject: Remove the CPU hardware context from the general list of test hardware contexts X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=288cd3bf5fb4269dbcdd7e8ce621f9c8e586693d;p=alexxy%2Fgromacs.git Remove the CPU hardware context from the general list of test hardware contexts Having CPU code path in the general hardware context list leads to having many if (CPU) clauses. This complicates things in places, where there are several CPU runners. Since in most cases (i.e. everywhere apart from Ewald tests) the runners know the hardware they should use, the selection of CPU or GPU is now done when the runners are created. TestHardwareContext is to become TestDeviceContext and will only hold GPU devices. --- diff --git a/admin/gitlab-ci/gromacs.gitlab-ci.yml b/admin/gitlab-ci/gromacs.gitlab-ci.yml index f2afb2d093..76d95bb0ba 100644 --- a/admin/gitlab-ci/gromacs.gitlab-ci.yml +++ b/admin/gitlab-ci/gromacs.gitlab-ci.yml @@ -639,6 +639,16 @@ gromacs:oneapi-2021.1-beta08-opencl:release:build: - export OMPI_ALLOW_RUN_AS_ROOT=1 - export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - export ASAN_OPTIONS="check_initialization_order=1:detect_invalid_pointer_pairs=1:strict_init_order=true:strict_string_checks=true:detect_stack_use_after_return=true" + # If $GMX_TEST_REQUIRED_NUMBER_OF_DEVICES is not set and we have GPUs, set it + - if [ -z $GMX_TEST_REQUIRED_NUMBER_OF_DEVICES ] && [ -n $KUBERNETES_EXTENDED_RESOURCE_NAME ] ; then + if grep -q '/gpu$' <<< "$KUBERNETES_EXTENDED_RESOURCE_NAME"; then + echo "export GMX_TEST_REQUIRED_NUMBER_OF_DEVICES=\"$KUBERNETES_EXTENDED_RESOURCE_LIMIT\""; + export GMX_TEST_REQUIRED_NUMBER_OF_DEVICES="$KUBERNETES_EXTENDED_RESOURCE_LIMIT"; + fi + fi + - if grep -qF 'nvidia.com/gpu' <<< "$KUBERNETES_EXTENDED_RESOURCE_NAME"; then + nvidia-smi || true; + fi - ctest -D $CTEST_RUN_MODE --output-on-failure | tee ctestLog.log || true - awk '/The following tests FAILED/,/^Errors while running CTest|^$/' ctestLog.log | tee ctestErrors.log diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst index 3878c4f298..9868a443a2 100644 --- a/docs/user-guide/environment-variables.rst +++ b/docs/user-guide/environment-variables.rst @@ -125,6 +125,12 @@ Debugging arrive first. Setting this variable switches to the generic path with fixed waiting order. +``GMX_TEST_REQUIRED_NUMBER_OF_DEVICES`` + sets the number of GPUs required by the test suite. By default, the test suite would + fall-back to using CPU if GPUs could not be detected. Set it to a positive integer value + to ensure that at least this at least this number of usable GPUs are detected. Default: + 0 (not testing GPU availability). + There are a number of extra environment variables like these that are used in debugging - check the code! diff --git a/src/gromacs/gpu_utils/tests/CMakeLists.txt b/src/gromacs/gpu_utils/tests/CMakeLists.txt index 18674a3e38..4140718519 100644 --- a/src/gromacs/gpu_utils/tests/CMakeLists.txt +++ b/src/gromacs/gpu_utils/tests/CMakeLists.txt @@ -42,6 +42,7 @@ gmx_add_unit_test(GpuUtilsUnitTests gpu_utils-test HARDWARE_DETECTION CPP_SOURCE_FILES # Tests of code clfftinitializer.cpp + device_availability.cpp device_stream_manager.cpp hostallocator.cpp pinnedmemorychecker.cpp diff --git a/src/gromacs/gpu_utils/tests/device_availability.cpp b/src/gromacs/gpu_utils/tests/device_availability.cpp new file mode 100644 index 0000000000..1807367872 --- /dev/null +++ b/src/gromacs/gpu_utils/tests/device_availability.cpp @@ -0,0 +1,123 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Check if GPUs are available when they should be. + * + * This is to test that CI can detect and use GPUs, when they are available. + * Driver and CUDA mismatch can lead to the tests falling back to the CPU + * code path quietly, leaving the GPU path untested. This is designed to + * fail when GPUs should be available, indicated by setting the environment + * variable \c GMX_TEST_REQUIRED_NUMBER_OF_DEVICES to 1 or greater. + * + * \author Artem Zhmurov + * \author Andrey Alekseenko + */ +#include "gmxpre.h" + +#include "config.h" + +#include + +#include + +#include "gromacs/hardware/device_management.h" +#include "gromacs/utility/baseversion.h" + +#include "testutils/test_hardware_environment.h" + +namespace gmx +{ + +namespace test +{ + +static unsigned long int getRequiredNumberOfDevices() +{ + const char* required_num_of_devices_str = getenv("GMX_TEST_REQUIRED_NUMBER_OF_DEVICES"); + if (required_num_of_devices_str == nullptr) + { + return 0; + } + else + { + errno = 0; + char* strtol_end; + const long int required_num_of_devices = + std::strtol(required_num_of_devices_str, &strtol_end, 10); + GMX_RELEASE_ASSERT(errno == 0, "Invalid value of GMX_TEST_REQUIRED_NUMBER_OF_DEVICES."); + GMX_RELEASE_ASSERT(*strtol_end == '\0', + "Trailing characters in GMX_TEST_REQUIRED_NUMBER_OF_DEVICES."); + GMX_RELEASE_ASSERT(required_num_of_devices >= 0, + "GMX_TEST_REQUIRED_NUMBER_OF_DEVICES can not be negative."); + return required_num_of_devices; + } +} + +TEST(DevicesAvailable, ShouldBeAbleToRunOnDevice) +{ + const unsigned long int required_num_of_devices = getRequiredNumberOfDevices(); + + if (required_num_of_devices != 0) + { + ASSERT_TRUE(GMX_GPU) << "GROMACS was compiled without GPU support, yet " + "GMX_TEST_REQUIRED_NUMBER_OF_DEVICES is set."; + + std::string platformString(getGpuImplementationString()); + + std::string errorMessage = "Can't perform device detection in " + platformString + ":\n"; + std::string detectionErrorMessage; + + // Test if the detection is working + ASSERT_TRUE(isDeviceDetectionEnabled()) + << errorMessage << "GPU device detection is disabled by " + << "GMX_DISABLE_GPU_DETECTION environment variable."; + ASSERT_TRUE(isDeviceDetectionFunctional(&detectionErrorMessage)) + << errorMessage + << "GPU device checks failed with the following message: " << detectionErrorMessage; + + // This is to test that the GPU detection test environment is working + const std::vector>& testDeviceList = + getTestHardwareEnvironment()->getTestDeviceList(); + ASSERT_FALSE(testDeviceList.empty()) + << errorMessage << "The test environment has an empty list of capable devices."; + ASSERT_GE(testDeviceList.size(), required_num_of_devices) + << errorMessage << "Requested " << required_num_of_devices << " device(s), " + << "but only " << testDeviceList.size() << " detected in the test environment."; + } +} + +} // namespace test +} // namespace gmx diff --git a/src/gromacs/hardware/device_information.h b/src/gromacs/hardware/device_information.h index 8c8020efaa..b38cccb703 100644 --- a/src/gromacs/hardware/device_information.h +++ b/src/gromacs/hardware/device_information.h @@ -71,7 +71,7 @@ enum class DeviceStatus : int Incompatible = 2, //! OpenCL device has incompatible cluster size for non-bonded kernels. IncompatibleClusterSize = 3, - /*! \brief An error occurred he functionality checks. + /*! \brief An error occurred during the functionality checks. * That indicates malfunctioning of the device, driver, or incompatible driver/runtime. */ NonFunctional = 4, diff --git a/src/gromacs/hardware/device_management.h b/src/gromacs/hardware/device_management.h index f037f7d47e..e3860253e9 100644 --- a/src/gromacs/hardware/device_management.h +++ b/src/gromacs/hardware/device_management.h @@ -118,7 +118,8 @@ bool canComputeOnDevice(); * properties, status. * * Note that this function leaves the GPU runtime API error state clean; - * this is implemented ATM in the CUDA flavor. + * this is implemented ATM in the CUDA flavor. This invalidates any existing + * CUDA streams, allocated memory on GPU, etc. * * \todo: Check if errors do propagate in OpenCL as they do in CUDA and * whether there is a mechanism to "clear" them.