set(CMAKE_REQUIRED_FLAGS "${CXX11_CXX_FLAG} ${${STDLIB_CXX_FLAG_NAME}}")
set(CMAKE_REQUIRED_LIBRARIES "${${STDLIB_LIBRARIES_NAME}}")
check_cxx_source_compiles(
-"#include <map>
+"#include <chrono>
+#include <map>
#include <memory>
+#include <thread>
#include <utility>
int main() {
typedef std::unique_ptr<int> intPointer;
intPointer p(new int(10));
std::map<int, std::unique_ptr<int>> m;
m.insert(std::make_pair(5, std::move(p)));
+ auto start = std::chrono::system_clock::now();
+ if (std::chrono::system_clock::now() - start < std::chrono::seconds(2))
+ {
+ std::thread t;
+ }
}" CXX11_STDLIB_PRESENT)
if(NOT CXX11_STDLIB_PRESENT)
message(FATAL_ERROR "This version of GROMACS requires C++11-compatible standard library. Please use a newer compiler, or a newer standard library, or use the GROMACS 5.1.x release. See the installation guide for details.")
'stddef.h', 'stdint.h', 'stdio.h', 'stdlib.h', 'string.h',
'time.h']
_std_c_cpp_headers = ['c' + x[:-2] for x in _std_c_headers]
- _std_cpp_headers = ['algorithm', 'array', 'deque', 'exception', 'fstream',
+ _std_cpp_headers = ['algorithm', 'array', 'chrono', 'deque', 'exception', 'fstream',
'functional', 'iomanip', 'ios', 'iosfwd', 'iostream', 'istream', 'iterator',
'limits', 'list', 'map', 'memory', 'new', 'numeric', 'ostream', 'random',
'regex', 'set', 'sstream', 'stdexcept', 'streambuf', 'string', 'strstream',
- 'tuple', 'type_traits', 'typeindex', 'typeinfo', 'vector', 'utility']
+ 'thread', 'tuple', 'type_traits', 'typeindex', 'typeinfo', 'vector', 'utility']
def __init__(self, style='pub-priv', absolute=False):
"""Initialize a sorted with the given style."""
spreading computation over multiple threads, such as OpenMP,
pthreads, winthreads, CUDA, OpenCL, and OpenACC. Some kinds of
hardware can map more than one software thread to a core; on
- Intel x86 processors this is called "hyper-threading."
- Normally, :ref:`gmx mdrun` will not benefit from such mapping.
+ Intel x86 processors this is called "hyper-threading", while
+ the more general concept is often called SMT for
+ "simultaneous multi-threading". IBM Power8 can for instance use
+ up to 8 hardware threads per core.
+ This feature can usually be enabled or disabled either in
+ the hardware BIOS or through a setting in the Linux operating
+ system. GROMACS can typically make use of this, for a moderate
+ free performance boost. In most cases it will be
+ enabled by default e.g. on new x86 processors, but in some cases
+ the system administrators might have disabled it. If that is the
+ case, ask if they can re-enable it for you. If you are not sure
+ if it is enabled, check the output of the CPU information in
+ the log file and compare with CPU specifications you find online.
thread affinity (pinning)
By default, most operating systems allow software threads to migrate
#include <cstring>
#include <algorithm>
+#include <chrono>
#include <string>
+#include <thread>
#include <vector>
#include "thread_mpi/threads.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/sysinfo.h"
+#ifdef HAVE_UNISTD_H
+# include <unistd.h> // sysconf()
+#endif
+
+//! Convenience macro to help us avoid ifdefs each time we use sysconf
+#if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
+# define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
+#endif
+
+//! Convenience macro to help us avoid ifdefs each time we use sysconf
+#if !defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROC_CONF)
+# define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
+#endif
+
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+//! Constant used to help minimize preprocessed code
+static const bool isX86 = true;
+#else
+//! Constant used to help minimize preprocessed code
+static const bool isX86 = false;
+#endif
+
+#if defined __powerpc__ || defined __ppc__ || defined __PPC__
+static const bool isPowerPC = true;
+#else
+static const bool isPowerPC = false;
+#endif
+
+//! Constant used to help minimize preprocessed code
static const bool bGPUBinary = GMX_GPU != GMX_GPU_NONE;
-static const bool bHasOmpSupport = GMX_OPENMP;
/* Note that some of the following arrays must match the "GPU support
* enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
#endif
}
+/*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
+ *
+ * This routine will check the number of cores configured and online
+ * (using sysconf), and then spin doing dummy compute operations for up to
+ * 2 seconds, or until all cores have come online. This can be used prior to
+ * hardware detection for platforms that take unused processors offline.
+ *
+ * The busy loop runs on the calling thread. When sysconf() or the required
+ * sysconf names are not available at compile time, the body is empty and
+ * the routine returns immediately.
+ *
+ * This routine will not throw exceptions.
+ */
+static void
+spinUpCore() noexcept
+{
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
+ // steady_clock is better than system_clock, but unsupported in gcc-4.6.4.
+ // For release-2017 we can retire gcc-4.6 support and move to steady_clock.
+ float dummy = 0.1;
+ int countConfigured = sysconf(_SC_NPROCESSORS_CONF); // noexcept
+ auto start = std::chrono::system_clock::now(); // noexcept
+
+ // Keep computing while fewer cores are online than configured, but give
+ // up once two seconds have elapsed since we started.
+ while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured &&
+ std::chrono::system_clock::now() - start < std::chrono::seconds(2))
+ {
+ // Short burst of floating-point work between two polls of the online count.
+ for (int i = 1; i < 10000; i++)
+ {
+ dummy /= i;
+ }
+ }
+
+ // Consume the result so that the dummy work above has a visible use.
+ if (dummy < 0)
+ {
+ printf("This cannot happen, but prevents loop from being optimized away.");
+ }
+#endif
+}
+
+/*! \brief Prepare the system before hardware topology detection
+ *
+ * This routine should perform any actions we want to put the system in a state
+ * where we want it to be before detecting the hardware topology. For most
+ * processors there is nothing to do, but some architectures (in particular ARM)
+ * have support for taking configured cores offline, which will make them disappear
+ * from the online processor count.
+ *
+ * This routine checks if there is a mismatch between the number of cores
+ * configured and online, and in that case we issue a small workload that
+ * attempts to wake sleeping cores before doing the actual detection. One
+ * worker thread per configured core is started, and all of them are joined
+ * before returning. If sysconf() cannot report a positive number of
+ * configured cores, or if all configured cores are already online, the
+ * routine returns without starting any threads.
+ *
+ * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
+ * been disabled in the kernel (rather than BIOS). Since those cores will never
+ * come online automatically, we currently skip this test for x86 & PowerPC to
+ * avoid wasting 2 seconds. We also skip the test if there is no thread support.
+ *
+ * \note Cores will sleep relatively quickly again, so it's important to issue
+ *       the real detection code directly after this routine.
+ */
+static void
+hardwareTopologyPrepareDetection()
+{
+#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN) && \
+    (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
+
+    // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
+    if (!isX86 && !isPowerPC)
+    {
+        const int countConfigured = sysconf(_SC_NPROCESSORS_CONF);
+
+        // sysconf() returns -1 on failure; using that as a container size
+        // would request an enormous allocation and throw. Also avoid
+        // launching any threads when there is no core left to wake up.
+        if (countConfigured <= 0 || sysconf(_SC_NPROCESSORS_ONLN) >= countConfigured)
+        {
+            return;
+        }
+
+        std::vector<std::thread> workThreads(countConfigured);
+
+        // Start one spinning worker per configured core ...
+        for (auto &t : workThreads)
+        {
+            t = std::thread(spinUpCore);
+        }
+
+        // ... and wait until every worker has finished (at most ~2 seconds each).
+        for (auto &t : workThreads)
+        {
+            t.join();
+        }
+    }
+#endif
+}
+
+/*! \brief Cross-check the detected processor count and optionally log notes
+ *
+ * Compares the logical processor count stored in the hardware topology with
+ * the count returned by sysconf(_SC_NPROCESSORS_CONF). When the two differ,
+ * a note is written to the log, followed by an SMT-related hint on x86 or
+ * PowerPC when the ratio between the counts matches the checks below.
+ *
+ * \param fplog             Log file pointer. This can be NULL, but then the
+ *                          routine will not do anything.
+ * \param hardwareTopology  Reference to hardwareTopology object.
+ */
+static void
+hardwareTopologyDoubleCheckDetection(FILE gmx_unused *fplog,
+                                     const gmx::HardwareTopology gmx_unused &hardwareTopology)
+{
+#if defined HAVE_SYSCONF && defined(_SC_NPROCESSORS_CONF)
+    // Without a log file there is nowhere to report to.
+    if (fplog == NULL)
+    {
+        return;
+    }
+    // Without at least a logical processor count there is nothing to compare.
+    if (hardwareTopology.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
+    {
+        return;
+    }
+
+    const int detectedCount   = hardwareTopology.machine().logicalProcessorCount;
+    const int configuredCount = sysconf(_SC_NPROCESSORS_CONF);
+
+    /* BIOS, kernel or user actions can take physical processors offline.
+     * Some of those cases are already handled by spinning up cores just
+     * before the detection, but there can be others where it is worthwhile
+     * to hint that more resources might be available.
+     */
+    if (configuredCount < 0 || configuredCount == detectedCount)
+    {
+        return;
+    }
+
+    fprintf(fplog, "Note: %d CPUs configured, but only %d were detected to be online.\n", configuredCount, detectedCount);
+
+    const bool halfDetected   = (configuredCount == 2*detectedCount);
+    const bool eighthDetected = (configuredCount == 8*detectedCount);
+
+    if (isX86 && halfDetected)
+    {
+        fprintf(fplog, " X86 Hyperthreading is likely disabled; enable it for better performance.\n");
+    }
+    // PowerPC (likely Power8) can run SMT with 2, 4, or 8 hardware threads per core.
+    // Only the fully disabled case is flagged, since default performance drops with SMT8.
+    if (isPowerPC && eighthDetected)
+    {
+        fprintf(fplog, " PowerPC SMT is likely disabled; enable SMT2/SMT4 for better performance.\n");
+    }
+#endif
+}
+
+
gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
gmx_bool bDetectGPUs)
{
snew(hwinfo_g, 1);
hwinfo_g->cpuInfo = new gmx::CpuInfo(gmx::CpuInfo::detect());
- hwinfo_g->hardwareTopology = new gmx::HardwareTopology(gmx::HardwareTopology::detect(fplog, cr));
+
+ hardwareTopologyPrepareDetection();
+ hwinfo_g->hardwareTopology = new gmx::HardwareTopology(gmx::HardwareTopology::detect());
+
+ // If we detected the topology on this system, double-check that it makes sense
+ if (hwinfo_g->hardwareTopology->isThisSystem())
+ {
+ hardwareTopologyDoubleCheckDetection(fplog, *(hwinfo_g->hardwareTopology));
+ }
// TODO: Get rid of this altogether.
hwinfo_g->nthreads_hw_avail = hwinfo_g->hardwareTopology->machine().logicalProcessorCount;
check_use_of_rdtscp_on_this_cpu(fplog, cr, cpuInfo);
}
-void checkLogicalProcessorCountIsConsistentWithOpenmp(FILE *fplog, const t_commrec *cr,
- const gmx::HardwareTopology *hardwareTopology)
-{
- if (bHasOmpSupport &&
- hardwareTopology->supportLevel() >=
- gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
- {
- int countFromDetection = hardwareTopology->machine().logicalProcessorCount;
- int countFromOpenmp = gmx_omp_get_num_procs();
- if (countFromDetection != countFromOpenmp)
- {
- md_print_warn(cr, fplog,
- "Number of logical cores detected (%d) does not match the number reported by OpenMP (%d).\n"
- "Consider setting the launch configuration manually!",
- countFromDetection, countFromOpenmp);
- }
- }
-}
-
//! \brief Return if any GPU ID (e.g in a user-supplied string) is repeated
static gmx_bool anyGpuIdIsRepeated(const gmx_gpu_opt_t *gpu_opt)
{
* example. */
gmx_bool gmx_gpu_sharing_supported();
-/* Construct the global hwinfo structure and return a pointer to
- it. Caller is responsible for freeing this pointer. */
+/*! \brief Run detection, consistency checks, and make available on all ranks.
+ *
+ * This routine constructs the global hwinfo structure and returns a pointer to
+ * it. It will run a preamble before executing cpu and hardware checks, and
+ * then run consistency checks afterwards. The results will also be made
+ * available on all nodes.
+ * Caller is responsible for freeing this pointer.
+ */
gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
gmx_bool bDetectGPUs);
void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
const gmx_hw_info_t *hwinfo);
-//! Warn the user if the OpenMP system doesn't agree with the hardware detection about the number of logical processors.
-void checkLogicalProcessorCountIsConsistentWithOpenmp(FILE *fplog, const t_commrec *cr,
- const gmx::HardwareTopology *hardwareTopology);
-
void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt);
# include <hwloc.h>
#endif
-#include "gromacs/gmxlib/md_logging.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/utility/gmxassert.h"
# include <windows.h> // GetSystemInfo()
#endif
-#if defined(_M_ARM) || defined(__arm__) || defined(__ARM_ARCH) || defined (__aarch64__)
-//! Constant used to help minimize preprocessed code
-static const bool isArm = true;
-#else
-//! Constant used to help minimize preprocessed code
-static const bool isArm = false;
+//! Convenience macro to help us avoid ifdefs each time we use sysconf
+#if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
+# define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
#endif
namespace gmx
* \return The number of hardware processing units, or 0 if it fails.
*/
int
-detectLogicalProcessorCount(FILE *fplog, const t_commrec *cr)
+detectLogicalProcessorCount()
{
int count = 0;
SYSTEM_INFO sysinfo;
GetSystemInfo( &sysinfo );
count = sysinfo.dwNumberOfProcessors;
-#elif defined HAVE_SYSCONF
+#elif defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
// We are probably on Unix. Check if we have the argument to use before executing any calls
-# if defined(_SC_NPROCESSORS_CONF)
- count = sysconf(_SC_NPROCESSORS_CONF);
-# if defined(_SC_NPROCESSORS_ONLN)
- /* On e.g. Arm, the Linux kernel can use advanced power saving features where
- * processors are brought online/offline dynamically. This will cause
- * _SC_NPROCESSORS_ONLN to report 1 at the beginning of the run. For this
- * reason we now warn if this mismatches with the detected core count. */
- int countOnline = sysconf(_SC_NPROCESSORS_ONLN);
- if (count != countOnline)
- {
- /* We assume that this scenario means that the kernel has
- disabled threads or cores, and that the only safe course is
- to assume that _SC_NPROCESSORS_ONLN should be used. Even
- this may not be valid if running in a containerized
- environment, such system calls may read from
- /sys/devices/system/cpu and report what the OS sees, rather
- than what the container cgroup is supposed to set up as
- limits. But we're not sure right now whether there's any
- (standard-ish) way to handle that.
-
- On ARM, the kernel may have powered down the cores,
- which we'll warn the user about now. On x86, this
- means HT is disabled by the kernel, not in the
- BIOS. We're not sure what it means on other
- architectures, or even if it is possible, because
- sysconf is rather non-standardized. */
- if (isArm)
- {
- md_print_warn(cr, fplog,
- "%d CPUs configured, but only %d of them are online.\n"
- "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n"
- "off to save power, and will turn them back on later when the load increases.\n"
- "However, this will likely mean GROMACS cannot pin threads to those cores. You\n"
- "will likely see much better performance by forcing all cores to be online, and\n"
- "making sure they run at their full clock frequency.", count, countOnline);
- }
- else
- {
- md_print_warn(cr, fplog,
- "Note: %d CPUs configured, but only %d of them are online, so GROMACS will use the latter.",
- count, countOnline);
- // We use the online count to avoid (potential) oversubscription.
- count = countOnline;
- }
- }
-# endif
-# elif defined(_SC_NPROC_CONF)
- count = sysconf(_SC_NPROC_CONF);
-# elif defined(_SC_NPROCESSORS_ONLN)
count = sysconf(_SC_NPROCESSORS_ONLN);
-# elif defined(_SC_NPROC_ONLN)
- count = sysconf(_SC_NPROC_ONLN);
-# else
-# warning "No valid sysconf argument value found. Executables will not be able to determine the number of logical cores: mdrun will use 1 thread by default!"
-# endif // End of check for sysconf argument values
-
#else
count = 0; // Neither windows nor Unix.
#endif
}
- GMX_UNUSED_VALUE(cr);
- GMX_UNUSED_VALUE(fplog);
return count;
}
} // namespace anonymous
// static
-HardwareTopology HardwareTopology::detect(FILE *fplog, const t_commrec *cr)
+HardwareTopology HardwareTopology::detect()
{
HardwareTopology result;
if (result.supportLevel_ == SupportLevel::None)
{
// No topology information; try to detect the number of logical processors at least
- result.machine_.logicalProcessorCount = detectLogicalProcessorCount(fplog, cr);
+ result.machine_.logicalProcessorCount = detectLogicalProcessorCount();
if (result.machine_.logicalProcessorCount > 0)
{
result.supportLevel_ = SupportLevel::LogicalProcessorCount;
#define GMX_HARDWARE_HARDWARETOPOLOGY_H
#include <cstdint>
-#include <cstdio>
#include <vector>
-struct t_commrec;
-
namespace gmx
{
public:
- /*! \brief Detects the hardware topology.
- *
- * Writes any warnings to stderr, and \c fplog if it is not nullptr.
- */
- static HardwareTopology detect(FILE *fplog, const t_commrec *cr);
+ /*! \brief Detects the hardware topology. */
+ static HardwareTopology detect();
/*! \brief Check what topology information that is available and valid
*
// depends on the architecture, but we can at least make sure that it
// works to execute the tests
- gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect(nullptr, nullptr));
+ gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect());
// If we cannot even find the number of logical processors we want to flag it
EXPECT_GT(hwTop.supportLevel(), gmx::HardwareTopology::SupportLevel::None)
TEST(HardwareTopologyTest, HwlocExecute)
{
#if defined(__linux__)
- gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect(nullptr, nullptr));
+ gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect());
// On Linux with hwloc support we should be able to get at least basic information
EXPECT_GE(hwTop.supportLevel(), gmx::HardwareTopology::SupportLevel::Basic)
TEST(HardwareTopologyTest, ProcessorSelfconsistency)
{
- gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect(nullptr, nullptr));
+ gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect());
if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic)
{
TEST(HardwareTopologyTest, NumaCacheSelfconsistency)
{
- gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect(nullptr, nullptr));
+ gmx::HardwareTopology hwTop(gmx::HardwareTopology::detect());
if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full)
{
hw_opt->gpu_opt.n_dev_use = 0;
}
- checkLogicalProcessorCountIsConsistentWithOpenmp(fplog, cr, hwinfo->hardwareTopology);
-
/* check consistency across ranks of things like SIMD
* support and number of GPUs selected */
gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);