From: Mark Abraham Date: Wed, 29 Jun 2016 13:30:40 +0000 (+0200) Subject: Merge branch release-5-1 into release-2016 X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ed8d537d336d90081b62d78a8ecd63f67cfb7a17;p=alexxy%2Fgromacs.git Merge branch release-5-1 into release-2016 The changes to checks in release-5-1 to check_nthreads_hw_avail() are all incorporated into detectLogicalProcessorCount(). Change-Id: Ifbee24dc40630988a14026b9f45cfaf6e92390ba --- ed8d537d336d90081b62d78a8ecd63f67cfb7a17 diff --cc src/gromacs/gpu_utils/cudautils.cuh index 495b1a4d29,0000000000..5daa040d28 mode 100644,000000..100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@@ -1,169 -1,0 +1,173 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_CUDAUTILS_CUH +#define GMX_GPU_UTILS_CUDAUTILS_CUH + +#include "config.h" + +#include +#if HAVE_NVML +#include +#endif /* HAVE_NVML */ + +#include "gromacs/utility/fatalerror.h" + +/* TODO error checking needs to be rewritten. We have 2 types of error checks needed + based on where they occur in the code: + - non performance-critical: these errors are unsafe to be ignored and must be + _always_ checked for, e.g. initializations + - performance critical: handling errors might hurt performance so care need to be taken + when/if we should check for them at all, e.g. in cu_upload_X. However, we should be + able to turn the check for these errors on! + + Probably we'll need two sets of the macros below... + + */ +#define CHECK_CUDA_ERRORS + +#ifdef CHECK_CUDA_ERRORS + +/*! Check for CUDA error on the return status of a CUDA RT API call. */ +#define CU_RET_ERR(status, msg) \ + do { \ + if (status != cudaSuccess) \ + { \ + gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \ + } \ + } while (0) + +/*! 
Check for any previously occurred uncaught CUDA error. */
+#define CU_CHECK_PREV_ERR() \
+ do { \
+ cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
+ if (_CU_CHECK_PREV_ERR_status != cudaSuccess) { \
+ gmx_warning("Just caught a previously occurred CUDA error (%s), will try to continue.", cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR(msg) \
+ do { \
+ cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
+ if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+ -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR_SYNC(msg) \
+ do { \
+ cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
+ if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
+ gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
+ } \
+ } while (0)
+
+#else /* CHECK_CUDA_ERRORS */
+
+#define CU_RET_ERR(status, msg) do { } while (0)
+#define CU_CHECK_PREV_ERR() do { } while (0)
+#define CU_LAUNCH_ERR(msg) do { } while (0)
+#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
+#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+
+#endif /* CHECK_CUDA_ERRORS */
+
+/*! \brief CUDA device information.
+ *
+ * The CUDA device information is queried and set at detection and contains
+ * both information about the device/hardware returned by the runtime as well
+ * as additional data like support status.
+ */
+struct gmx_device_info_t
+{
- int id; /* id of the CUDA device */
- cudaDeviceProp prop; /* CUDA device properties */
- int stat; /* result of the device check */
- gmx_bool nvml_initialized; /* If NVML was initialized */
- gmx_bool nvml_ap_clocks_changed; /* If application clocks have been changed */
++ int id; /* id of the CUDA device */
++ cudaDeviceProp prop; /* CUDA device properties */
++ int stat; /* result of the device check */
++ gmx_bool nvml_initialized; /* If NVML was initialized */
++ unsigned int nvml_orig_app_sm_clock; /* The original SM clock before we changed it */
++ unsigned int nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
++ gmx_bool nvml_app_clocks_changed; /* If application clocks have been changed */
++ unsigned int nvml_set_app_sm_clock; /* The SM clock we set */
++ unsigned int nvml_set_app_mem_clock; /* The memory clock we set */
+#if HAVE_NVML
- nvmlDevice_t nvml_device_id; /* NVML device id */
- nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
- #endif /* HAVE_NVML */
++ nvmlDevice_t nvml_device_id; /* NVML device id */
++ nvmlEnableState_t nvml_is_restricted; /* Status of application clocks permission */
++#endif /* HAVE_NVML */
+};
+
+
+/*! Launches synchronous device to host memory copy in stream 0. */
+int cu_copy_D2H(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous device to host memory copy in stream s. */
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates host memory and launches synchronous device to host memory copy. */
+int cu_copy_D2H_alloc(void ** /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+
+/*! Launches synchronous host to device memory copy.
*/ +int cu_copy_H2D(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); + +/*! Launches asynchronous host to device memory copy in stream s. */ +int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/); + +/*! Allocates device memory and launches synchronous host to device memory copy. */ +int cu_copy_H2D_alloc(void ** /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); + +/*! Frees device memory and resets the size and allocation size to -1. */ +void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL); + +/*! Reallocates the device memory and copies data from the host. */ +void cu_realloc_buffered(void **d_dest, void *h_src, + size_t type_size, + int *curr_size, int *curr_alloc_size, + int req_size, + cudaStream_t s, + bool bAsync); + +/*! Waits for event e to complete, */ +int cu_wait_event(cudaEvent_t /*e*/); + +/*! Calculates and returns the time elapsed between event start and end. */ +float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/); + +/*! Waits for event end to complete and calculates the time between start and end. */ +int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/); + +#endif diff --cc src/gromacs/gpu_utils/gpu_utils.cu index f08d55443e,0000000000..e8b78249e9 mode 100644,000000..100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@@ -1,832 -1,0 +1,867 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \file + * \brief Define functions for detection and initialization for CUDA devices. 
+ * + * \author Szilard Pall + */ + +#include "gmxpre.h" + +#include "gpu_utils.h" + +#include "config.h" + +#include +#include +#include + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/pmalloc_cuda.h" +#include "gromacs/hardware/gpu_hw_info.h" +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/smalloc.h" + +#if HAVE_NVML +#include +#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6) +#else /* HAVE_NVML */ +#define HAVE_NVML_APPLICATION_CLOCKS 0 +#endif /* HAVE_NVML */ + +#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS +/*! Check for NVML error on the return status of a NVML API call. */ +# define HANDLE_NVML_RET_ERR(status, msg) \ + do { \ + if (status != NVML_SUCCESS) \ + { \ + gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \ + } \ + } while (0) +#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */ +# define HANDLE_NVML_RET_ERR(status, msg) do { } while (0) +#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */ + +#if HAVE_NVML_APPLICATION_CLOCKS +static const gmx_bool bCompiledWithApplicationClockSupport = true; +#else +static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false; +#endif + +/*! \internal \brief + * Max number of devices supported by CUDA (for consistency checking). + * + * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. + */ +static int cuda_max_device_count = 32; + +static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL)); + +/** Dummy kernel used for sanity checking. */ +__global__ void k_dummy_test() +{ +} + + +/*! + * \brief Runs GPU sanity checks. + * + * Runs a series of checks to determine that the given GPU and underlying CUDA + * driver/runtime functions properly. + * Returns properties of a device with given ID or the one that has + * already been initialized earlier in the case if of \dev_id == -1. + * + * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized + * \param[out] dev_prop pointer to the structure in which the device properties will be returned + * \returns 0 if the device looks OK + * + * TODO: introduce errors codes and handle errors more smoothly. 
+ */ +static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) +{ + cudaError_t cu_err; + int dev_count, id; + + cu_err = cudaGetDeviceCount(&dev_count); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + + /* no CUDA compatible device at all */ + if (dev_count == 0) + { + return -1; + } + + /* things might go horribly wrong if cudart is not compatible with the driver */ + if (dev_count < 0 || dev_count > cuda_max_device_count) + { + return -1; + } + + if (dev_id == -1) /* device already selected let's not destroy the context */ + { + cu_err = cudaGetDevice(&id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + } + else + { + id = dev_id; + if (id > dev_count - 1) /* pfff there's no such device */ + { + fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n", + dev_id, dev_count); + return -1; + } + } + + memset(dev_prop, 0, sizeof(cudaDeviceProp)); + cu_err = cudaGetDeviceProperties(dev_prop, id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err, + cudaGetErrorString(cu_err)); + return -1; + } + + /* both major & minor is 9999 if no CUDA capable devices are present */ + if (dev_prop->major == 9999 && dev_prop->minor == 9999) + { + return -1; + } + /* we don't care about emulation mode */ + if (dev_prop->major == 0) + { + return -1; + } + + if (id != -1) + { + cu_err = cudaSetDevice(id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while switching to device #%d: %s\n", + cu_err, id, cudaGetErrorString(cu_err)); + return -1; + } + } + + /* try to execute a dummy kernel */ + k_dummy_test<<< 1, 512>>> (); + if (cudaThreadSynchronize() != cudaSuccess) + { + return -1; + } + + /* destroy context if we created one */ + if (id != -1) + { + cu_err = cudaDeviceReset(); + CU_RET_ERR(cu_err, "cudaDeviceReset failed"); + } + + return 0; +} + +#if HAVE_NVML +/* TODO: We should actually be using md_print_warn in md_logging.c, + * but we can't include mpi.h in CUDA code. + */ +static void md_print_info(FILE *fplog, + const char *fmt, ...) +{ + va_list ap; + + if (fplog != NULL) + { + /* We should only print to stderr on the master node, + * in most cases fplog is only set on the master node, so this works. + */ + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + va_start(ap, fmt); + vfprintf(fplog, fmt, ap); + va_end(ap); + } +} +#endif /*HAVE_NVML*/ + +/* TODO: We should actually be using md_print_warn in md_logging.c, + * but we can't include mpi.h in CUDA code. + * This is replicated from nbnxn_cuda_data_mgmt.cu. + */ +static void md_print_warn(FILE *fplog, + const char *fmt, ...) +{ + va_list ap; + + if (fplog != NULL) + { + /* We should only print to stderr on the master node, + * in most cases fplog is only set on the master node, so this works. + */ + va_start(ap, fmt); + fprintf(stderr, "\n"); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + + va_start(ap, fmt); + fprintf(fplog, "\n"); + vfprintf(fplog, fmt, ap); + fprintf(fplog, "\n"); + va_end(ap); + } +} + +#if HAVE_NVML_APPLICATION_CLOCKS +/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev. + * + * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by + * matching PCI-E information from \cuda_dev with the available NVML devices. 
+ *
+ * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
+ * \returns true if \cuda_dev could be enriched with matching NVML device information.
+ */
+static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
+{
+ nvmlDevice_t nvml_device_id;
+ unsigned int nvml_device_count = 0;
+ nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
+ cuda_dev->nvml_initialized = false;
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
+ for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
+ {
+ nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+
+ nvmlPciInfo_t nvml_pci_info;
+ nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
+ if (nvml_stat != NVML_SUCCESS)
+ {
+ break;
+ }
+ if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
+ static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
+ {
+ cuda_dev->nvml_initialized = true;
+ cuda_dev->nvml_device_id = nvml_device_id;
+ break;
+ }
+ }
+ return cuda_dev->nvml_initialized;
+}
++
++/*! \brief Reads and returns the application clocks for the device.
++ *
++ * \param[in] cuda_dev The GPU device
++ * \param[out] app_sm_clock The current application SM clock
++ * \param[out] app_mem_clock The current application memory clock
++ * \returns true if application clocks are supported
++ */
++static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
++ unsigned int *app_sm_clock,
++ unsigned int *app_mem_clock)
++{
++ nvmlReturn_t nvml_stat;
++
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
++ if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++ {
++ return false;
++ }
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++ nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
++ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++
++ return true;
++}
+#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+
+/*! \brief Tries to set application clocks for the GPU with the given index.
+ *
+ * The variable \gpuid is the index of the GPU in the gpu_info.gpu_dev array
+ * to handle the application clocks for. Application clocks are set to the
+ * max supported value to increase performance if application clock permissions
+ * allow this. For future GPU architectures a more sophisticated scheme might be
+ * required.
+ *
+ * \param[out] fplog log file to write to
+ * \param[in] gpuid index of the GPU to set application clocks for
+ * \param[in] gpu_info GPU info of all detected devices in the system.
+ * \returns true if no error occurs during application clocks handling.
+ */ +static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info) +{ + const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop; + int cuda_version_number = prop->major * 10 + prop->minor; + gmx_bool bGpuCanUseApplicationClocks = + ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) || + (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 )); + if (!bGpuCanUseApplicationClocks) + { + return true; + } +#if !HAVE_NVML + int cuda_driver = 0; + int cuda_runtime = 0; + cudaDriverGetVersion(&cuda_driver); + cudaRuntimeGetVersion(&cuda_runtime); + md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n" + " application clocks of the detected %s GPU to improve performance.\n" + " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n", + prop->name); + return true; +#else + if (!bCompiledWithApplicationClockSupport) + { + int cuda_driver = 0; + int cuda_runtime = 0; + cudaDriverGetVersion(&cuda_driver); + cudaRuntimeGetVersion(&cuda_runtime); + md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n" + " managing application clocks of the detected %s GPU to improve performance.\n" + " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n", + prop->name ); + return true; + } + + /* We've compiled with NVML application clocks support, and have a GPU that can use it */ + nvmlReturn_t nvml_stat = NVML_SUCCESS; + char *env; + //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks + // this variable can be later used to give a user more fine grained control. + env = getenv("GMX_GPU_APPLICATION_CLOCKS"); + if (env != NULL && ( strcmp( env, "0") == 0 || + gmx_strcasecmp( env, "OFF") == 0 || + gmx_strcasecmp( env, "DISABLE") == 0 )) + { + return true; + } + nvml_stat = nvmlInit(); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." 
); + if (nvml_stat != NVML_SUCCESS) + { + return false; + } - if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid]))) ++ ++ gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]); ++ ++ if (!addNVMLDeviceId(cuda_dev)) + { + return false; + } + //get current application clocks setting - unsigned int app_sm_clock = 0; - unsigned int app_mem_clock = 0; - nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock ); - if (NVML_ERROR_NOT_SUPPORTED == nvml_stat) ++ if (!getApplicationClocks(cuda_dev, ++ &cuda_dev->nvml_orig_app_sm_clock, ++ &cuda_dev->nvml_orig_app_mem_clock)) + { + return false; + } - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); - nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock ); - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); + //get max application clocks + unsigned int max_sm_clock = 0; + unsigned int max_mem_clock = 0; - nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock ); ++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" ); - nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock ); ++ nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" ); + - gpu_info->gpu_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED; - gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false; ++ cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED; ++ cuda_dev->nvml_app_clocks_changed = false; + - nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) ); ++ nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted)); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" ); + + /* Note: Distinguishing between different types of GPUs here might be necessary in the future, + e.g. if max application clocks should not be used for certain GPUs. 
*/ - if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED) ++ if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED) + { - md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock); - nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock ); ++ md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock); ++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); - gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true; ++ cuda_dev->nvml_app_clocks_changed = true; ++ cuda_dev->nvml_set_app_sm_clock = max_sm_clock; ++ cuda_dev->nvml_set_app_mem_clock = max_mem_clock; + } - else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock) ++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock) + { - md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock); ++ md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock); + } - else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock) ++ else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock) + { - //TODO: This should probably be integrated into the GPU Properties table. - md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock); ++ md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock); + } + else + { + md_print_warn( fplog, "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat)); + } + return (nvml_stat == NVML_SUCCESS); +#endif /* HAVE_NVML */ +} + +/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev. + * + * \param[in] gpu_dev CUDA device information + */ +static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev) +{ +#if !HAVE_NVML_APPLICATION_CLOCKS + GMX_UNUSED_VALUE(cuda_dev); + return true; +#else /* HAVE_NVML_APPLICATION_CLOCKS */ + nvmlReturn_t nvml_stat = NVML_SUCCESS; + if (cuda_dev && + cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED && - cuda_dev->nvml_ap_clocks_changed) ++ cuda_dev->nvml_app_clocks_changed) + { - nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id ); - HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" ); ++ /* Check if the clocks are still what we set them to. 
++ * If so, set them back to the state we originally found them in. ++ * If not, don't touch them, because something else set them later. ++ */ ++ unsigned int app_sm_clock, app_mem_clock; ++ getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock); ++ if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock && ++ app_mem_clock == cuda_dev->nvml_set_app_mem_clock) ++ { ++ nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock); ++ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" ); ++ } + } + nvml_stat = nvmlShutdown(); + HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" ); + return (nvml_stat == NVML_SUCCESS); +#endif /* HAVE_NVML_APPLICATION_CLOCKS */ +} + +gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str, + const struct gmx_gpu_info_t *gpu_info, + const struct gmx_gpu_opt_t *gpu_opt) +{ + cudaError_t stat; + char sbuf[STRLEN]; + int gpuid; + + assert(gpu_info); + assert(result_str); + + if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use) + { + sprintf(sbuf, "Trying to initialize an inexistent GPU: " + "there are %d %s-selected GPU(s), but #%d was requested.", + gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu); + gmx_incons(sbuf); + } + + gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id; + + stat = cudaSetDevice(gpuid); + strncpy(result_str, cudaGetErrorString(stat), STRLEN); + + if (debug) + { + fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name); + } + + //Ignoring return value as NVML errors should be treated not critical. + if (stat == cudaSuccess) + { + init_gpu_application_clocks(fplog, gpuid, gpu_info); + } + return (stat == cudaSuccess); +} + +gmx_bool free_cuda_gpu( + int gmx_unused mygpu, char *result_str, + const gmx_gpu_info_t gmx_unused *gpu_info, + const gmx_gpu_opt_t gmx_unused *gpu_opt + ) +{ + cudaError_t stat; + gmx_bool reset_gpu_application_clocks_status = true; + int gpuid; + + assert(result_str); + + if (debug) + { + int gpuid; + stat = cudaGetDevice(&gpuid); + CU_RET_ERR(stat, "cudaGetDevice failed"); + fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid); + } + + gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1; + if (gpuid != -1) + { + reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) ); + } + + stat = cudaDeviceReset(); + strncpy(result_str, cudaGetErrorString(stat), STRLEN); + return (stat == cudaSuccess) && reset_gpu_application_clocks_status; +} + +/*! \brief Returns true if the gpu characterized by the device properties is + * supported by the native gpu acceleration. + * + * \param[in] dev_prop the CUDA device properties of the gpus to test. + * \returns true if the GPU properties passed indicate a compatible + * GPU, otherwise false. + */ +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop) +{ + return (dev_prop->major >= 2); +} + +/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU. + * + * \param[in] stat GPU status. + * \returns true if the provided status is egpuCompatible, otherwise false. + */ +static bool is_compatible_gpu(int stat) +{ + return (stat == egpuCompatible); +} + +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. + * + * Returns a status value which indicates compatibility or one of the following + * errors: incompatibility, insistence, or insanity (=unexpected behavior). 
+ * It also returns the respective device's properties in \dev_prop (if applicable). + * + * \param[in] dev_id the ID of the GPU to check. + * \param[out] dev_prop the CUDA device properties of the device checked. + * \returns the status of the requested device + */ +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) +{ + cudaError_t stat; + int ndev; + + stat = cudaGetDeviceCount(&ndev); + if (stat != cudaSuccess) + { + return egpuInsane; + } + + if (dev_id > ndev - 1) + { + return egpuNonexistent; + } + + /* TODO: currently we do not make a distinction between the type of errors + * that can appear during sanity checks. This needs to be improved, e.g if + * the dummy test kernel fails to execute with a "device busy message" we + * should appropriately report that the device is busy instead of insane. + */ + if (do_sanity_checks(dev_id, dev_prop) == 0) + { + if (is_gmx_supported_gpu(dev_prop)) + { + return egpuCompatible; + } + else + { + return egpuIncompatible; + } + } + else + { + return egpuInsane; + } +} + + +int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str) +{ + int i, ndev, checkres, retval; + cudaError_t stat; + cudaDeviceProp prop; + gmx_device_info_t *devs; + + assert(gpu_info); + assert(err_str); + + gpu_info->n_dev_compatible = 0; + + ndev = 0; + devs = NULL; + + stat = cudaGetDeviceCount(&ndev); + if (stat != cudaSuccess) + { + const char *s; + + /* cudaGetDeviceCount failed which means that there is something + * wrong with the machine: driver-runtime mismatch, all GPUs being + * busy in exclusive mode, or some other condition which should + * result in us issuing a warning a falling back to CPUs. */ + retval = -1; + s = cudaGetErrorString(stat); + strncpy(err_str, s, STRLEN*sizeof(err_str[0])); + } + else + { + snew(devs, ndev); + for (i = 0; i < ndev; i++) + { + checkres = is_gmx_supported_gpu_id(i, &prop); + + devs[i].id = i; + devs[i].prop = prop; + devs[i].stat = checkres; + + if (checkres == egpuCompatible) + { + gpu_info->n_dev_compatible++; + } + } + retval = 0; + } + + gpu_info->n_dev = ndev; + gpu_info->gpu_dev = devs; + + return retval; +} + +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) +{ + int i, ncompat; + int *compat; + + assert(gpu_info); + /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */ + assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1)); + + snew(compat, gpu_info->n_dev); + ncompat = 0; + for (i = 0; i < gpu_info->n_dev; i++) + { + if (is_compatible_gpu(gpu_info->gpu_dev[i].stat)) + { + ncompat++; + compat[ncompat - 1] = i; + } + } + + gpu_opt->n_dev_compatible = ncompat; + snew(gpu_opt->dev_compatible, ncompat); + memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat)); + sfree(compat); +} + +gmx_bool check_selected_gpus(int *checkres, + const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) +{ + int i, id; + bool bAllOk; + + assert(checkres); + assert(gpu_info); + assert(gpu_opt->n_dev_use >= 0); + + if (gpu_opt->n_dev_use == 0) + { + return TRUE; + } + + assert(gpu_opt->dev_use); + + /* we will assume that all GPUs requested are valid IDs, + otherwise we'll bail anyways */ + + bAllOk = true; + for (i = 0; i < gpu_opt->n_dev_use; i++) + { + id = gpu_opt->dev_use[i]; + + /* devices are stored in increasing order of IDs in gpu_dev */ + gpu_opt->dev_use[i] = id; + + checkres[i] = (id >= gpu_info->n_dev) ? 
+ egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+ bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+ }
+
+ return bAllOk;
+}
+
+void free_gpu_info(const gmx_gpu_info_t *gpu_info)
+{
+ if (gpu_info == NULL)
+ {
+ return;
+ }
+
+ sfree(gpu_info->gpu_dev);
+}
+
+void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
+{
+ assert(s);
+ assert(gpu_info);
+
+ if (index < 0 || index >= gpu_info->n_dev)
+ {
+ return;
+ }
+
+ gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
+
+ bool bGpuExists =
+ dinfo->stat == egpuCompatible ||
+ dinfo->stat == egpuIncompatible;
+
+ if (!bGpuExists)
+ {
+ sprintf(s, "#%d: %s, stat: %s",
+ dinfo->id, "N/A",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+ else
+ {
+ sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
+ dinfo->id, dinfo->prop.name,
+ dinfo->prop.major, dinfo->prop.minor,
+ dinfo->prop.ECCEnabled ? "yes" : " no",
+ gpu_detect_res_str[dinfo->stat]);
+ }
+}
+
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
+{
+ assert(gpu_info);
+ assert(gpu_opt);
+ assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+ return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
+}
+
+int get_current_cuda_gpu_device_id(void)
+{
+ int gpuid;
+ CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
+
+ return gpuid;
+}
+
+size_t sizeof_gpu_dev_info(void)
+{
+ return sizeof(gmx_device_info_t);
+}
+
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free)
+{
+ if (bUseGpuKernels)
+ {
+ *nb_alloc = &pmalloc;
+ *nb_free = &pfree;
+ }
+ else
+ {
+ *nb_alloc = NULL;
+ *nb_free = NULL;
+ }
+}
+
+void startGpuProfiler(void)
+{
+ /* The NVPROF_ID environment variable is set by nvprof and indicates that
+ mdrun is executed in the CUDA profiler.
+ If nvprof was run with "--profile-from-start off", the profiler will
+ be started here. This way we can avoid tracing the CUDA events from the
+ first part of the run. Starting the profiler again does nothing.
+ */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStart();
+ CU_RET_ERR(stat, "cudaProfilerStart failed");
+ }
+}
+
+void stopGpuProfiler(void)
+{
+ /* Stopping the nvidia profiler here allows us to eliminate the subsequent
+ API calls from the trace, e.g. uninitialization and cleanup. */
+ if (cudaProfilerRun)
+ {
+ cudaError_t stat;
+ stat = cudaProfilerStop();
+ CU_RET_ERR(stat, "cudaProfilerStop failed");
+ }
+}
+
+void resetGpuProfiler(void)
+{
+ /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
+ * the profiling here (can't stop it) which will achieve the desired effect if
+ * the run was started with the profiling disabled.
+ *
+ * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
+ * stopGpuProfiler();
+ */
+ if (cudaProfilerRun)
+ {
+ startGpuProfiler();
+ }
+}
diff --cc src/gromacs/hardware/hardwaretopology.cpp
index 8a284ae0e2,0000000000..59a4d1d9ff
mode 100644,000000..100644
--- a/src/gromacs/hardware/hardwaretopology.cpp
+++ b/src/gromacs/hardware/hardwaretopology.cpp
@@@ -1,677 -1,0 +1,712 @@@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Implements gmx::HardwareTopology. + * + * \author Erik Lindahl + * \ingroup module_hardware + */ + +#include "gmxpre.h" + +#include "hardwaretopology.h" + +#include "config.h" + +#include + +#include +#include + +#if GMX_HWLOC +# include +#endif + +#include "gromacs/gmxlib/md_logging.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxomp.h" + +#ifdef HAVE_UNISTD_H +# include // sysconf() +#endif +#if GMX_NATIVE_WINDOWS +# include // GetSystemInfo() +#endif + ++#if defined(_M_ARM) || defined(__arm__) || defined(__ARM_ARCH) || defined (__aarch64__) ++//! Constant used to help minimize preprocessed code ++static const bool isArm = true; ++#else ++//! Constant used to help minimize preprocessed code ++static const bool isArm = false; ++#endif ++ +namespace gmx +{ + +namespace +{ + +/***************************************************************************** + * * + * Utility functions for extracting hardware topology from CpuInfo object * + * * + *****************************************************************************/ + +/*! \brief Initialize machine data from basic information in cpuinfo + * + * \param machine Machine tree structure where information will be assigned + * if the cpuinfo object contains topology information. + * \param supportLevel If topology information is available in CpuInfo, + * this will be updated to reflect the amount of + * information written to the machine structure. 
+ */ +void +parseCpuInfo(HardwareTopology::Machine * machine, + HardwareTopology::SupportLevel * supportLevel) +{ + CpuInfo cpuInfo(CpuInfo::detect()); + + if (!cpuInfo.logicalProcessors().empty()) + { + int nSockets = 0; + int nCores = 0; + int nHwThreads = 0; + + // Copy the logical processor information from cpuinfo + for (auto &l : cpuInfo.logicalProcessors()) + { + machine->logicalProcessors.push_back( { l.socketRankInMachine, l.coreRankInSocket, l.hwThreadRankInCore, -1 } ); + nSockets = std::max(nSockets, l.socketRankInMachine); + nCores = std::max(nCores, l.coreRankInSocket); + nHwThreads = std::max(nHwThreads, l.hwThreadRankInCore); + } + + // Fill info form sockets/cores/hwthreads + int socketId = 0; + int coreId = 0; + int hwThreadId = 0; + + machine->sockets.resize(nSockets + 1); + for (auto &s : machine->sockets) + { + s.id = socketId++; + s.cores.resize(nCores + 1); + for (auto &c : s.cores) + { + c.id = coreId++; + c.numaNodeId = -1; // No numa information + c.hwThreads.resize(nHwThreads + 1); + for (auto &t : c.hwThreads) + { + t.id = hwThreadId++; + t.logicalProcessorId = -1; // set as unassigned for now + } + } + } + + // Fill the logical processor id in the right place + for (std::size_t i = 0; i < machine->logicalProcessors.size(); i++) + { + const HardwareTopology::LogicalProcessor &l = machine->logicalProcessors[i]; + machine->sockets[l.socketRankInMachine].cores[l.coreRankInSocket].hwThreads[l.hwThreadRankInCore].logicalProcessorId = static_cast(i); + } + machine->logicalProcessorCount = machine->logicalProcessors.size(); + *supportLevel = HardwareTopology::SupportLevel::Basic; + } + else + { + *supportLevel = HardwareTopology::SupportLevel::None; + } +} + +#if GMX_HWLOC + +#if HWLOC_API_VERSION < 0x00010b00 +# define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET +# define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE +#endif + +/***************************************************************************** + * * + * Utility functions for extracting hardware topology from hwloc library * + * * + *****************************************************************************/ + +/*! \brief Return vector of all descendants of a given type in hwloc topology + * + * \param obj Non-null hwloc object. + * \param type hwloc object type to find. The routine will only search + * on levels below obj. + * + * \return vector containing all the objects of given type that are + * descendants of the provided object. If no objects of this type + * were found, the vector will be empty. + */ +const std::vector +getHwLocDescendantsByType(const hwloc_obj_t obj, const hwloc_obj_type_t type) +{ + GMX_RELEASE_ASSERT(obj, "NULL hwloc object provided to getHwLocDescendantsByType()"); + + std::vector v; + + // Go through children; if this object has no children obj->arity is 0, + // and we'll return an empty vector. + for (std::size_t i = 0; i < obj->arity; i++) + { + // If the child is the type we're looking for, add it directly. + // Otherwise call this routine recursively for each child. + if (obj->children[i]->type == type) + { + v.push_back(obj->children[i]); + } + else + { + std::vector v2 = getHwLocDescendantsByType(obj->children[i], type); + v.insert(v.end(), v2.begin(), v2.end()); + } + } + return v; +} + +/*! \brief Read information about sockets, cores and threads from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where the tree of sockets/cores/threads will be written. 
+ * + * \return If all the data is found the return value is 0, otherwise non-zero. + */ +int +parseHwLocSocketsCoresThreads(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector hwlocSockets = getHwLocDescendantsByType(root, HWLOC_OBJ_PACKAGE); + + machine->logicalProcessorCount = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); + machine->logicalProcessors.resize(machine->logicalProcessorCount); + machine->sockets.resize(hwlocSockets.size()); + + bool topologyOk = !hwlocSockets.empty(); // Fail if we have no sockets in machine + + for (std::size_t i = 0; i < hwlocSockets.size() && topologyOk; i++) + { + // Assign information about this socket + machine->sockets[i].id = hwlocSockets[i]->logical_index; + + // Get children (cores) + std::vector hwlocCores = getHwLocDescendantsByType(hwlocSockets[i], HWLOC_OBJ_CORE); + machine->sockets[i].cores.resize(hwlocCores.size()); + + topologyOk = topologyOk && !hwlocCores.empty(); // Fail if we have no cores in socket + + // Loop over child cores + for (std::size_t j = 0; j < hwlocCores.size() && topologyOk; j++) + { + // Assign information about this core + machine->sockets[i].cores[j].id = hwlocCores[j]->logical_index; + machine->sockets[i].cores[j].numaNodeId = -1; + + // Get children (hwthreads) + std::vector hwlocPUs = getHwLocDescendantsByType(hwlocCores[j], HWLOC_OBJ_PU); + machine->sockets[i].cores[j].hwThreads.resize(hwlocPUs.size()); + + topologyOk = topologyOk && !hwlocPUs.empty(); // Fail if we have no hwthreads in core + + // Loop over child hwthreads + for (std::size_t k = 0; k < hwlocPUs.size() && topologyOk; k++) + { + // Assign information about this hwthread + std::size_t logicalProcessorId = hwlocPUs[k]->os_index; + machine->sockets[i].cores[j].hwThreads[k].id = hwlocPUs[k]->logical_index; + machine->sockets[i].cores[j].hwThreads[k].logicalProcessorId = logicalProcessorId; + + if (logicalProcessorId < machine->logicalProcessors.size()) + { + // Cross-assign data for this hwthread to the logicalprocess vector + machine->logicalProcessors[logicalProcessorId].socketRankInMachine = static_cast(i); + machine->logicalProcessors[logicalProcessorId].coreRankInSocket = static_cast(j); + machine->logicalProcessors[logicalProcessorId].hwThreadRankInCore = static_cast(k); + machine->logicalProcessors[logicalProcessorId].numaNodeId = -1; + } + else + { + topologyOk = false; + } + } + } + } + + if (topologyOk) + { + return 0; + } + else + { + machine->logicalProcessors.clear(); + machine->sockets.clear(); + return -1; + } +} + +/*! \brief Read cache information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where cache data will be filled. + * + * \return If any cache data is found the return value is 0, otherwise non-zero. 
+ */ +int +parseHwLocCache(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + // Parse caches up to L5 + for (int cachelevel : { 1, 2, 3, 4, 5}) + { + int depth = hwloc_get_cache_type_depth(topo, cachelevel, HWLOC_OBJ_CACHE_DATA); + + if (depth >= 0) + { + hwloc_obj_t cache = hwloc_get_next_obj_by_depth(topo, depth, NULL); + if (cache != NULL) + { + std::vector hwThreads = getHwLocDescendantsByType(cache, HWLOC_OBJ_PU); + + machine->caches.push_back( { + static_cast(cache->attr->cache.depth), + static_cast(cache->attr->cache.size), + static_cast(cache->attr->cache.linesize), + static_cast(cache->attr->cache.associativity), + std::max(static_cast(hwThreads.size()), 1) + } ); + } + } + } + return machine->caches.empty(); +} + + +/*! \brief Read numa information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where numa information will be filled. + * + * Hwloc should virtually always be able to detect numa information, but if + * there is only a single numa node in the system it is not reported at all. + * In this case we create a single numa node covering all cores. + * + * This function uses the basic socket/core/thread information detected by + * parseHwLocSocketsCoresThreads(), which means that routine must have + * completed successfully before calling this one. If this is not the case, + * you will get an error return code. + * + * \return If the data found makes sense (either in the numa node or the + * entire machine) the return value is 0, otherwise non-zero. + */ +int +parseHwLocNuma(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector hwlocNumaNodes = getHwLocDescendantsByType(root, HWLOC_OBJ_NUMANODE); + bool topologyOk = true; + + if (!hwlocNumaNodes.empty()) + { + machine->numa.nodes.resize(hwlocNumaNodes.size()); + + for (std::size_t i = 0; i < hwlocNumaNodes.size(); i++) + { + machine->numa.nodes[i].id = hwlocNumaNodes[i]->logical_index; + machine->numa.nodes[i].memory = hwlocNumaNodes[i]->memory.total_memory; + machine->numa.nodes[i].logicalProcessorId.clear(); + + // Get list of PUs in this numa node + std::vector hwlocPUs = getHwLocDescendantsByType(hwlocNumaNodes[i], HWLOC_OBJ_PU); + + for (auto &p : hwlocPUs) + { + machine->numa.nodes[i].logicalProcessorId.push_back(p->os_index); + + GMX_RELEASE_ASSERT(p->os_index < machine->logicalProcessors.size(), "OS index of PU in hwloc larger than processor count"); + + machine->logicalProcessors[p->os_index].numaNodeId = static_cast(i); + std::size_t s = machine->logicalProcessors[p->os_index].socketRankInMachine; + std::size_t c = machine->logicalProcessors[p->os_index].coreRankInSocket; + + GMX_RELEASE_ASSERT(s < machine->sockets.size(), "Socket index in logicalProcessors larger than socket count"); + GMX_RELEASE_ASSERT(c < machine->sockets[s].cores.size(), "Core index in logicalProcessors larger than core count"); + // Set numaNodeId in core too + machine->sockets[s].cores[c].numaNodeId = i; + } + } + + int depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE); + const struct hwloc_distances_s * dist = hwloc_get_whole_distance_matrix_by_depth(topo, depth); + if (dist != NULL && dist->nbobjs == hwlocNumaNodes.size()) + { + machine->numa.baseLatency = dist->latency_base; + machine->numa.maxRelativeLatency = dist->latency_max; + machine->numa.relativeLatency.resize(dist->nbobjs); + for 
(std::size_t i = 0; i < dist->nbobjs; i++) + { + machine->numa.relativeLatency[i].resize(dist->nbobjs); + for (std::size_t j = 0; j < dist->nbobjs; j++) + { + machine->numa.relativeLatency[i][j] = dist->latency[i*dist->nbobjs+j]; + } + } + } + else + { + topologyOk = false; + } + } + else + { + // No numa nodes found. Use the entire machine as a numa node. + const hwloc_obj_t hwlocMachine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, NULL); + + if (hwlocMachine != NULL) + { + machine->numa.nodes.resize(1); + machine->numa.nodes[0].id = 0; + machine->numa.nodes[0].memory = hwlocMachine->memory.total_memory; + machine->numa.baseLatency = 10; + machine->numa.maxRelativeLatency = 1; + machine->numa.relativeLatency = { { 1.0 } }; + + for (int i = 0; i < machine->logicalProcessorCount; i++) + { + machine->numa.nodes[0].logicalProcessorId.push_back(i); + } + for (auto &l : machine->logicalProcessors) + { + l.numaNodeId = 0; + } + for (auto &s : machine->sockets) + { + for (auto &c : s.cores) + { + c.numaNodeId = 0; + } + } + } + else + { + topologyOk = false; + } + } + + if (topologyOk) + { + return 0; + } + else + { + machine->numa.nodes.clear(); + return -1; + } + +} + +/*! \brief Read PCI device information from hwloc topology + * + * \param topo hwloc topology handle that has been initialized and loaded + * \param machine Pointer to the machine structure in the HardwareTopology + * class, where PCI device information will be filled. + * * + * \return If any devices were found the return value is 0, otherwise non-zero. + */ +int +parseHwLocDevices(const hwloc_topology_t topo, + HardwareTopology::Machine * machine) +{ + const hwloc_obj_t root = hwloc_get_root_obj(topo); + std::vector pcidevs = getHwLocDescendantsByType(root, HWLOC_OBJ_PCI_DEVICE); + + for (auto &p : pcidevs) + { + const hwloc_obj_t ancestor = hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_NUMANODE, p); + int numaId; + if (ancestor != NULL) + { + numaId = ancestor->logical_index; + } + else + { + // If we only have a single numa node we belong to it, otherwise set it to -1 (unknown) + numaId = (machine->numa.nodes.size() == 1) ? 0 : -1; + } + + GMX_RELEASE_ASSERT(p->attr, "Attributes should not be NULL for hwloc PCI object"); + + machine->devices.push_back( { + p->attr->pcidev.vendor_id, + p->attr->pcidev.device_id, + p->attr->pcidev.class_id, + p->attr->pcidev.domain, + p->attr->pcidev.bus, + p->attr->pcidev.dev, + p->attr->pcidev.func, + numaId + } ); + } + return pcidevs.empty(); +} + +void +parseHwLoc(HardwareTopology::Machine * machine, + HardwareTopology::SupportLevel * supportLevel, + bool * isThisSystem) +{ + hwloc_topology_t topo; + + // Initialize a hwloc object, set flags to request IO device information too, + // try to load the topology, and get the root object. If either step fails, + // return that we do not have any support at all from hwloc. + if (hwloc_topology_init(&topo) != 0) + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. + } + + hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES); + + if (hwloc_topology_load(topo) != 0 || hwloc_get_root_obj(topo) == NULL) + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. 
+ } + + // If we get here, we can get a valid root object for the topology + *isThisSystem = hwloc_topology_is_thissystem(topo); + + // Parse basic information about sockets, cores, and hardware threads + if (parseHwLocSocketsCoresThreads(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::Basic; + } + else + { + hwloc_topology_destroy(topo); + return; // SupportLevel::None. + } + + // Get information about cache and numa nodes + if (parseHwLocCache(topo, machine) == 0 && parseHwLocNuma(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::Full; + } + else + { + hwloc_topology_destroy(topo); + return; // SupportLevel::Basic. + } + + // PCI devices + if (parseHwLocDevices(topo, machine) == 0) + { + *supportLevel = HardwareTopology::SupportLevel::FullWithDevices; + } + + hwloc_topology_destroy(topo); + return; // SupportLevel::Full or SupportLevel::FullWithDevices. +} + +#endif + +/*! \brief Try to detect the number of logical processors. + * + * \return The number of hardware processing units, or 0 if it fails. + */ +int +detectLogicalProcessorCount(FILE *fplog, const t_commrec *cr) +{ + int count = 0; + + { +#if GMX_NATIVE_WINDOWS + // Windows + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + count = sysinfo.dwNumberOfProcessors; +#elif defined HAVE_SYSCONF + // We are probably on Unix. Check if we have the argument to use before executing any calls +# if defined(_SC_NPROCESSORS_CONF) + count = sysconf(_SC_NPROCESSORS_CONF); +# if defined(_SC_NPROCESSORS_ONLN) + /* On e.g. Arm, the Linux kernel can use advanced power saving features where + * processors are brought online/offline dynamically. This will cause + * _SC_NPROCESSORS_ONLN to report 1 at the beginning of the run. For this + * reason we now warn if this mismatches with the detected core count. */ + int countOnline = sysconf(_SC_NPROCESSORS_ONLN); + if (count != countOnline) + { - md_print_warn(cr, fplog, - "%d CPUs configured, but only %d of them are online.\n" - "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n" - "off to save power, and will turn them back on later when the load increases.\n" - "However, this will likely mean GROMACS cannot pin threads to those cores. You\n" - "will likely see much better performance by forcing all cores to be online, and\n" - "making sure they run at their full clock frequency.", count, countOnline); ++ /* We assume that this scenario means that the kernel has ++ disabled threads or cores, and that the only safe course is ++ to assume that _SC_NPROCESSORS_ONLN should be used. Even ++ this may not be valid if running in a containerized ++ environment, such system calls may read from ++ /sys/devices/system/cpu and report what the OS sees, rather ++ than what the container cgroup is supposed to set up as ++ limits. But we're not sure right now whether there's any ++ (standard-ish) way to handle that. ++ ++ On ARM, the kernel may have powered down the cores, ++ which we'll warn the user about now. On x86, this ++ means HT is disabled by the kernel, not in the ++ BIOS. We're not sure what it means on other ++ architectures, or even if it is possible, because ++ sysconf is rather non-standardized. */ ++ if (isArm) ++ { ++ md_print_warn(cr, fplog, ++ "%d CPUs configured, but only %d of them are online.\n" ++ "This can happen on embedded platforms (e.g. 
ARM) where the OS shuts some cores\n" ++ "off to save power, and will turn them back on later when the load increases.\n" ++ "However, this will likely mean GROMACS cannot pin threads to those cores. You\n" ++ "will likely see much better performance by forcing all cores to be online, and\n" ++ "making sure they run at their full clock frequency.", count, countOnline); ++ } ++ else ++ { ++ md_print_warn(cr, fplog, ++ "Note: %d CPUs configured, but only %d of them are online, so GROMACS will use the latter.", ++ count, countOnline); ++ // We use the online count to avoid (potential) oversubscription. ++ count = countOnline; ++ } + } +# endif +# elif defined(_SC_NPROC_CONF) + count = sysconf(_SC_NPROC_CONF); +# elif defined(_SC_NPROCESSORS_ONLN) + count = sysconf(_SC_NPROCESSORS_ONLN); +# elif defined(_SC_NPROC_ONLN) + count = sysconf(_SC_NPROC_ONLN); +# else +# warning "No valid sysconf argument value found. Executables will not be able to determine the number of logical cores: mdrun will use 1 thread by default!" +# endif // End of check for sysconf argument values + +#else + count = 0; // Neither windows nor Unix. +#endif + } +#if GMX_OPENMP + int countFromOpenmp = gmx_omp_get_num_procs(); + if (count != countFromOpenmp) + { + md_print_warn(cr, fplog, + "Number of logical cores detected (%d) does not match the number reported by OpenMP (%d).\n" + "Consider setting the launch configuration manually!", + count, countFromOpenmp); + } +#endif + + GMX_UNUSED_VALUE(cr); + GMX_UNUSED_VALUE(fplog); + return count; +} + +} // namespace anonymous + +// static +HardwareTopology HardwareTopology::detect(FILE *fplog, const t_commrec *cr) +{ + HardwareTopology result; + + // Default values for machine and numa stuff + result.machine_.logicalProcessorCount = 0; + result.machine_.numa.baseLatency = 0.0; + result.machine_.numa.maxRelativeLatency = 0.0; + result.supportLevel_ = SupportLevel::None; + result.isThisSystem_ = true; + +#if GMX_HWLOC + parseHwLoc(&result.machine_, &result.supportLevel_, &result.isThisSystem_); +#endif + + // If something went wrong in hwloc (or if it was not present) we might + // have more information in cpuInfo + if (result.supportLevel_ < SupportLevel::Basic) + { + // There might be topology information in cpuInfo + parseCpuInfo(&result.machine_, &result.supportLevel_); + } + // If we did not manage to get anything from either hwloc or cpuInfo, find the cpu count at least + if (result.supportLevel_ == SupportLevel::None) + { + // No topology information; try to detect the number of logical processors at least + result.machine_.logicalProcessorCount = detectLogicalProcessorCount(fplog, cr); + if (result.machine_.logicalProcessorCount > 0) + { + result.supportLevel_ = SupportLevel::LogicalProcessorCount; + } + } + return result; +} + + +HardwareTopology::HardwareTopology() + : supportLevel_(SupportLevel::None) +{ +} + +int HardwareTopology::numberOfCores() const +{ + if (supportLevel() >= SupportLevel::Basic) + { + // We assume all sockets have the same number of cores as socket 0. + // Since topology information is present, we can assume there is at least one socket. + return machine().sockets.size() * machine().sockets[0].cores.size(); + } + else if (supportLevel() >= SupportLevel::LogicalProcessorCount) + { + return machine().logicalProcessorCount; + } + else + { + return 0; + } +} + +} // namespace gmx
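
For context, here is a minimal usage sketch of the error-checking macros declared in cudautils.cuh above. The kernel k_scale, the helper scaleOnGpu, and the sizes are invented for illustration only and are not part of this change; only CU_RET_ERR and CU_LAUNCH_ERR come from the header.

    /* Hypothetical usage sketch of the cudautils.cuh error-checking macros. */
    #include "gromacs/gpu_utils/cudautils.cuh"

    __global__ void k_scale(float *d_x, int n, float s)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n)
        {
            d_x[i] *= s;
        }
    }

    static void scaleOnGpu(float *h_x, int n, float s)
    {
        float      *d_x;
        cudaError_t stat;

        /* Non performance-critical path: always check the return status. */
        stat = cudaMalloc((void **)&d_x, n*sizeof(*d_x));
        CU_RET_ERR(stat, "cudaMalloc of d_x failed");
        stat = cudaMemcpy(d_x, h_x, n*sizeof(*d_x), cudaMemcpyHostToDevice);
        CU_RET_ERR(stat, "H2D copy of x failed");

        /* Kernel launches return no status; check the last error instead. */
        k_scale<<< (n + 255)/256, 256 >>> (d_x, n, s);
        CU_LAUNCH_ERR("k_scale");

        stat = cudaMemcpy(h_x, d_x, n*sizeof(*d_x), cudaMemcpyDeviceToHost);
        CU_RET_ERR(stat, "D2H copy of x failed");
        stat = cudaFree(d_x);
        CU_RET_ERR(stat, "cudaFree of d_x failed");
    }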
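
Similarly, a rough standalone sketch of the configured-versus-online processor check that this merge folds into detectLogicalProcessorCount(). The helper name countLogicalProcessors and the plain fprintf reporting are assumptions for this example, not GROMACS symbols; the fallback to the online count mirrors the non-ARM branch of the patch.

    /* Standalone sketch of sysconf()-based logical processor detection. */
    #include <cstdio>
    #include <unistd.h>

    static int countLogicalProcessors()
    {
        int count = 0;
    #if defined(_SC_NPROCESSORS_CONF)
        count = sysconf(_SC_NPROCESSORS_CONF);
    #  if defined(_SC_NPROCESSORS_ONLN)
        int countOnline = sysconf(_SC_NPROCESSORS_ONLN);
        if (count != countOnline)
        {
            /* Some cores are configured but offline (e.g. powered down on ARM,
               or hyperthreads disabled by the kernel on x86). Using the online
               count avoids oversubscribing the processors actually available. */
            std::fprintf(stderr,
                         "Note: %d CPUs configured, but only %d are online; using the latter.\n",
                         count, countOnline);
            count = countOnline;
        }
    #  endif
    #endif
        return count;
    }

    int main()
    {
        std::printf("Detected %d logical processors\n", countLogicalProcessors());
        return 0;
    }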