Improve issues with CUDA interacting with std flags

author Mark Abraham <mark.j.abraham@gmail.com>

Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)

committer Paul Bauer <paul.bauer.q@gmail.com>

Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)
author Mark Abraham <mark.j.abraham@gmail.com>
Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)
diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake

index 898246762958ab3b80f0aa5dc749eba9f0a9b0df..12e54cb40efca94399964c7cff008d92dca0dd38 100644 (file)
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -159,7 +159,15 @@ if (CUDA_VERSION VERSION_LESS 11.0)
      # version.
      list(APPEND GMX_CUDA_NVCC_FLAGS "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
  else()
-    list(APPEND GMX_CUDA_NVCC_FLAGS "${CMAKE_CXX17_STANDARD_COMPILE_OPTION}")
+    # gcc-7 pre-dated C++17, so uses the -std=c++1z compiler flag for it,
+    # which modern nvcc does not recognize. So we work around that by
+    # compiling in C++14 mode. Clang doesn't have this problem because nvcc
+    # only supports versions of clang that already understood -std=c++17.
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
+        list(APPEND GMX_CUDA_NVCC_FLAGS "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
+    else()
+        list(APPEND GMX_CUDA_NVCC_FLAGS "${CMAKE_CXX17_STANDARD_COMPILE_OPTION}")
+    endif()
  endif()
  
  # assemble the CUDA flags
diff --git a/src/gromacs/compat/pointers.h b/src/gromacs/compat/pointers.h

index f10760581fec26b052fe82d55306c35bfe8083a5..37689a919033a1927d7491a0cea59d6254578a9b 100644 (file)
--- a/src/gromacs/compat/pointers.h
+++ b/src/gromacs/compat/pointers.h
@@ -89,24 +89,24 @@ template<class T>
  class not_null
  {
  public:
-    static_assert(std::is_assignable_v<T&, std::nullptr_t>, "T cannot be assigned nullptr.");
+    static_assert(std::is_assignable<T&, std::nullptr_t>::value, "T cannot be assigned nullptr.");
  
      //! Move constructor. Asserts in debug mode if \c is nullptr.
-    template<typename U, typename = std::enable_if_t<std::is_convertible_v<U, T>>>
+    template<typename U, typename = std::enable_if_t<std::is_convertible<U, T>::value>>
      constexpr explicit not_null(U&& u) : ptr_(std::forward<U>(u))
      {
          Expects(ptr_ != nullptr);
      }
  
      //! Simple constructor. Asserts in debug mode if \c u is nullptr.
-    template<typename = std::enable_if_t<!std::is_same_v<std::nullptr_t, T>>>
+    template<typename = std::enable_if_t<!std::is_same<std::nullptr_t, T>::value>>
      constexpr explicit not_null(T u) : ptr_(u)
      {
          Expects(ptr_ != nullptr);
      }
  
      //! Copy constructor.
-    template<typename U, typename = std::enable_if_t<std::is_convertible_v<U, T>>>
+    template<typename U, typename = std::enable_if_t<std::is_convertible<U, T>::value>>
      constexpr not_null(const not_null<U>& other) : not_null(other.get())
      {
      }
diff --git a/src/gromacs/hardware/CMakeLists.txt b/src/gromacs/hardware/CMakeLists.txt

index 8161c495dd2d8b24104fcdd9f380f4cd86e18452..668e6ab1867766ba4c3e9694cec5d09e89686ead 100644 (file)
--- a/src/gromacs/hardware/CMakeLists.txt
+++ b/src/gromacs/hardware/CMakeLists.txt
@@ -37,6 +37,7 @@ gmx_add_libgromacs_sources(
      detecthardware.cpp
      device_management_common.cpp
      hardwaretopology.cpp
+    prepare_detection.cpp
      printhardware.cpp
      identifyavx512fmaunits.cpp
      )
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index 884b98a4a2079cf063a215863cd8f91c7871cb5d..54e2eefb3e7ff4d7d55e1079800d4d776114acf1 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -41,13 +41,10 @@
  
  #include <algorithm>
  #include <array>
-#include <chrono>
  #include <memory>
  #include <string>
-#include <thread>
  #include <vector>
  
-#include "gromacs/compat/pointers.h"
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/device_management.h"
  #include "gromacs/hardware/hardwaretopology.h"
@@ -62,11 +59,11 @@
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/inmemoryserializer.h"
  #include "gromacs/utility/logger.h"
-#include "gromacs/utility/mutex.h"
  #include "gromacs/utility/physicalnodecommunicator.h"
  
  #include "architecture.h"
  #include "device_information.h"
+#include "prepare_detection.h"
  
  #ifdef HAVE_UNISTD_H
  #    include <unistd.h> // sysconf()
@@ -140,7 +137,7 @@ static DeviceDetectionResult detectAllDeviceInformation(const PhysicalNodeCommun
      // Read-only access is enforced with providing those ranks with a
      // handle to a const object, so usage is also free of races.
      GMX_UNUSED_VALUE(physicalNodeComm);
-    isMasterRankOfPhysicalNode = true;
+    isMasterRankOfPhysicalNode        = true;
  #endif
  
      /* The SYCL and OpenCL support requires us to run detection on all
@@ -203,9 +200,9 @@ static DeviceDetectionResult detectAllDeviceInformation(const PhysicalNodeCommun
  }
  
  //! Reduce the locally collected \p hardwareInfo over MPI ranks
-static void gmx_collect_hardware_mpi(const gmx::CpuInfo&              cpuInfo,
-                                     const PhysicalNodeCommunicator&  physicalNodeComm,
-                                     compat::not_null<gmx_hw_info_t*> hardwareInfo)
+static void gmx_collect_hardware_mpi(const gmx::CpuInfo&             cpuInfo,
+                                     const PhysicalNodeCommunicator& physicalNodeComm,
+                                     gmx_hw_info_t*                  hardwareInfo)
  {
      const int ncore = hardwareInfo->hardwareTopology->numberOfCores();
      /* Zen1 is assumed for:
@@ -296,7 +293,6 @@ static void gmx_collect_hardware_mpi(const gmx::CpuInfo&              cpuInfo,
      hardwareInfo->bIdenticalGPUs      = (maxMinReduced[4] == -maxMinReduced[9]);
      hardwareInfo->haveAmdZen1Cpu      = (maxMinReduced[10] > 0);
  #else
-    /* All ranks use the same pointer, protected by a mutex in the caller */
      hardwareInfo->nphysicalnode       = 1;
      hardwareInfo->ncore_tot           = ncore;
      hardwareInfo->ncore_min           = ncore;
@@ -315,90 +311,6 @@ static void gmx_collect_hardware_mpi(const gmx::CpuInfo&              cpuInfo,
  #endif
  }
  
-/*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
- *
- *  This routine will check the number of cores configured and online
- *  (using sysconf), and the spins doing dummy compute operations for up to
- *  2 seconds, or until all cores have come online. This can be used prior to
- *  hardware detection for platforms that take unused processors offline.
- *
- *  This routine will not throw exceptions. In principle it should be
- *  declared noexcept, but at least icc 19.1 and 21-beta08 with the
- *  libstdc++-7.5 has difficulty implementing a std::vector of
- *  std::thread started with this function when declared noexcept. It
- *  is not clear whether the problem is the compiler or the standard
- *  library. Fortunately, this function is not performance sensitive,
- *  and only runs on platforms other than x86 and POWER (ie ARM),
- *  so the possible overhead introduced by omitting noexcept is not
- *  important.
- */
-static void spinUpCore()
-{
-#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
-    float dummy           = 0.1;
-    int   countConfigured = sysconf(_SC_NPROCESSORS_CONF);    // noexcept
-    auto  start           = std::chrono::steady_clock::now(); // noexcept
-
-    while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured
-           && std::chrono::steady_clock::now() - start < std::chrono::seconds(2))
-    {
-        for (int i = 1; i < 10000; i++)
-        {
-            dummy /= i;
-        }
-    }
-
-    if (dummy < 0)
-    {
-        printf("This cannot happen, but prevents loop from being optimized away.");
-    }
-#endif
-}
-
-/*! \brief Prepare the system before hardware topology detection
- *
- * This routine should perform any actions we want to put the system in a state
- * where we want it to be before detecting the hardware topology. For most
- * processors there is nothing to do, but some architectures (in particular ARM)
- * have support for taking configured cores offline, which will make them disappear
- * from the online processor count.
- *
- * This routine checks if there is a mismatch between the number of cores
- * configured and online, and in that case we issue a small workload that
- * attempts to wake sleeping cores before doing the actual detection.
- *
- * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
- * been disabled in the kernel (rather than bios). Since those cores will never
- * come online automatically, we currently skip this test for x86 & PowerPC to
- * avoid wasting 2 seconds. We also skip the test if there is no thread support.
- *
- * \note Cores will sleep relatively quickly again, so it's important to issue
- *       the real detection code directly after this routine.
- */
-static void hardwareTopologyPrepareDetection()
-{
-#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) \
-        && (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
-
-    // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
-    if (c_architecture != Architecture::X86 && c_architecture != Architecture::PowerPC)
-    {
-        int                      countConfigured = sysconf(_SC_NPROCESSORS_CONF);
-        std::vector<std::thread> workThreads(countConfigured);
-
-        for (auto& t : workThreads)
-        {
-            t = std::thread(spinUpCore);
-        }
-
-        for (auto& t : workThreads)
-        {
-            t.join();
-        }
-    }
-#endif
-}
-
  void hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused& mdlog,
                                            const gmx::HardwareTopology gmx_unused& hardwareTopology)
  {
@@ -448,7 +360,7 @@ void hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused& mdlog,
  
  std::unique_ptr<gmx_hw_info_t> gmx_detect_hardware(const PhysicalNodeCommunicator& physicalNodeComm)
  {
-    // Make the new hardwareInfo in a temporary.
+    // Ensure all cores have spun up, where applicable.
      hardwareTopologyPrepareDetection();
  
      // TODO: We should also do CPU hardware detection only once on each
@@ -469,7 +381,7 @@ std::unique_ptr<gmx_hw_info_t> gmx_detect_hardware(const PhysicalNodeCommunicato
          std::swap(hardwareInfo->hardwareDetectionWarnings_, deviceDetectionResult.deviceDetectionWarnings_);
      }
  
-    gmx_collect_hardware_mpi(*hardwareInfo->cpuInfo, physicalNodeComm, compat::make_not_null(hardwareInfo));
+    gmx_collect_hardware_mpi(*hardwareInfo->cpuInfo, physicalNodeComm, hardwareInfo.get());
  
      return hardwareInfo;
  }
diff --git a/src/gromacs/hardware/prepare_detection.cpp b/src/gromacs/hardware/prepare_detection.cpp

new file mode 100644 (file)

index 0000000..885afdc
--- /dev/null
+++ b/src/gromacs/hardware/prepare_detection.cpp
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Defines routine for activating potentially deactivated cores
+ * so they can be detected.
+ *
+ * The use of std::thread makes for brittle interaction with std
+ * library headers. The caller of this routine also handles GPU detection and
+ * allocation of device-specific data structures. This is more
+ * manageable when separated into two distinct translation units.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_hardware
+ */
+#include "gmxpre.h"
+
+#include "prepare_detection.h"
+
+#include "config.h"
+
+#include <cstdio>
+
+#include <chrono>
+#include <thread>
+#include <vector>
+
+#include "architecture.h"
+
+#ifdef HAVE_UNISTD_H
+#    include <unistd.h> // sysconf()
+#endif
+
+namespace gmx
+{
+
+/*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
+ *
+ *  This routine will check the number of cores configured and online
+ *  (using sysconf), and then spins doing dummy compute operations for up to
+ *  2 seconds, or until all cores have come online. This can be used prior to
+ *  hardware detection for platforms that take unused processors offline.
+ *
+ *  This routine will not throw exceptions. In principle it should be
+ *  declared noexcept, but at least icc 19.1 and 21-beta08 with
+ *  libstdc++-7.5 have difficulty implementing a std::vector of
+ *  std::thread started with this function when declared noexcept. It
+ *  is not clear whether the problem is the compiler or the standard
+ *  library. Fortunately, this function is not performance sensitive,
+ *  and only runs on platforms other than x86 and POWER (e.g. ARM),
+ *  so the possible overhead introduced by omitting noexcept is not
+ *  important.
+ */
+static void spinUpCore()
+{
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
+    float dummy           = 0.1;
+    int   countConfigured = sysconf(_SC_NPROCESSORS_CONF);    // noexcept
+    auto  start           = std::chrono::steady_clock::now(); // noexcept
+
+    while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured
+           && std::chrono::steady_clock::now() - start < std::chrono::seconds(2))
+    {
+        for (int i = 1; i < 10000; i++)
+        {
+            dummy /= i;
+        }
+    }
+
+    if (dummy < 0)
+    {
+        printf("This cannot happen, but prevents loop from being optimized away.");
+    }
+#endif
+}
+
+void hardwareTopologyPrepareDetection()
+{
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) \
+        && (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
+
+    // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
+    if (c_architecture != Architecture::X86 && c_architecture != Architecture::PowerPC)
+    {
+        int                      countConfigured = sysconf(_SC_NPROCESSORS_CONF);
+        std::vector<std::thread> workThreads(countConfigured);
+
+        for (auto& t : workThreads)
+        {
+            t = std::thread(spinUpCore);
+        }
+
+        for (auto& t : workThreads)
+        {
+            t.join();
+        }
+    }
+#endif
+}
+
+} // namespace gmx
diff --git a/src/gromacs/hardware/prepare_detection.h b/src/gromacs/hardware/prepare_detection.h

new file mode 100644 (file)

index 0000000..38c94b0
--- /dev/null
+++ b/src/gromacs/hardware/prepare_detection.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal
+ * \file
+ * \brief Declares routine for activating potentially deactivated
+ * cores so they can be detected.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_hardware
+ */
+#ifndef GMX_HARDWARE_PREPAREDETECTION_H
+#define GMX_HARDWARE_PREPAREDETECTION_H
+
+namespace gmx
+{
+
+/*! \brief Prepare the system before hardware topology detection
+ *
+ * This routine should perform any actions needed to put the system in the
+ * state we want it to be in before detecting the hardware topology. For most
+ * processors there is nothing to do, but some architectures (in particular ARM)
+ * have support for taking configured cores offline, which will make them disappear
+ * from the online processor count.
+ *
+ * This routine checks if there is a mismatch between the number of cores
+ * configured and online, and in that case we issue a small workload that
+ * attempts to wake sleeping cores before doing the actual detection.
+ *
+ * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
+ * been disabled in the kernel (rather than bios). Since those cores will never
+ * come online automatically, we currently skip this test for x86 & PowerPC to
+ * avoid wasting 2 seconds. We also skip the test if there is no thread support.
+ *
+ * \note Cores will sleep relatively quickly again, so it's important to issue
+ *       the real detection code directly after this routine.
+ */
+void hardwareTopologyPrepareDetection();
+
+} // namespace gmx
+
+#endif
author	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Thu, 5 Nov 2020 08:00:48 +0000 (08:00 +0000)
cmake/gmxManageNvccConfig.cmake		patch \| blob \| history
src/gromacs/compat/pointers.h		patch \| blob \| history
src/gromacs/hardware/CMakeLists.txt		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history
src/gromacs/hardware/prepare_detection.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/hardware/prepare_detection.h	[new file with mode: 0644]	patch \| blob