Merge branch release-5-1 into release-2016

author Mark Abraham <mark.j.abraham@gmail.com>

Wed, 29 Jun 2016 13:30:40 +0000 (15:30 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 30 Jun 2016 20:00:07 +0000 (22:00 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Wed, 29 Jun 2016 13:30:40 +0000 (15:30 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 30 Jun 2016 20:00:07 +0000 (22:00 +0200)
diff --cc src/gromacs/gpu_utils/cudautils.cuh

index 495b1a4d291ce32e853d01d91a4cf49ad3c1d155,0000000000000000000000000000000000000000..5daa040d28f5e00e964766c351ed7af57917c991

mode 100644,000000..100644
--- 1/src/gromacs/gpu_utils/cudautils.cuh
--- /dev/null
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@@ -1,169 -1,0 +1,173 @@@
-     int                 id;                     /* id of the CUDA device */
-     cudaDeviceProp      prop;                   /* CUDA device properties */
-     int                 stat;                   /* result of the device check */
-     gmx_bool            nvml_initialized;       /* If NVML was initialized */
-     gmx_bool            nvml_ap_clocks_changed; /* If application clocks have been changed */
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifndef GMX_GPU_UTILS_CUDAUTILS_CUH
+ +#define GMX_GPU_UTILS_CUDAUTILS_CUH
+ +
+ +#include "config.h"
+ +
+ +#include <stdio.h>
+ +#if HAVE_NVML
+ +#include <nvml.h>
+ +#endif /* HAVE_NVML */
+ +
+ +#include "gromacs/utility/fatalerror.h"
+ +
+ +/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
+ +   based on where they occur in the code:
+ +   - non performance-critical: these errors are unsafe to be ignored and must be
+ +     _always_ checked for, e.g. initializations
+ +   - performance critical: handling errors might hurt performance so care need to be taken
+ +     when/if we should check for them at all, e.g. in cu_upload_X. However, we should be
+ +     able to turn the check for these errors on!
+ +
+ +   Probably we'll need two sets of the macros below...
+ +
+ + */
+ +#define CHECK_CUDA_ERRORS
+ +
+ +#ifdef CHECK_CUDA_ERRORS
+ +
+ +/*! Check for CUDA error on the return status of a CUDA RT API call. */
+ +#define CU_RET_ERR(status, msg) \
+ +    do { \
+ +        if (status != cudaSuccess) \
+ +        { \
+ +            gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
+ +        } \
+ +    } while (0)
+ +
+ +/*! Check for any previously occurred uncaught CUDA error. */
+ +#define CU_CHECK_PREV_ERR() \
+ +    do { \
+ +        cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
+ +        if (_CU_CHECK_PREV_ERR_status != cudaSuccess) { \
+ +            gmx_warning("Just caught a previously occurred CUDA error (%s), will try to continue.", cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
+ +        } \
+ +    } while (0)
+ +
+ +/*! Check for any previously occurred uncaught CUDA error
+ +   -- aimed at use after kernel calls. */
+ +#define CU_LAUNCH_ERR(msg) \
+ +    do { \
+ +        cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
+ +        if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
+ +            gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
+ +        } \
+ +    } while (0)
+ +
+ +/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+ +   -- aimed at use after kernel calls. */
+ +#define CU_LAUNCH_ERR_SYNC(msg) \
+ +    do { \
+ +        cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
+ +        if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
+ +            gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
+ +        } \
+ +    } while (0)
+ +
+ +#else /* CHECK_CUDA_ERRORS */
+ +
+ +#define CU_RET_ERR(status, msg) do { } while (0)
+ +#define CU_CHECK_PREV_ERR()     do { } while (0)
+ +#define CU_LAUNCH_ERR(msg)      do { } while (0)
+ +#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
+ +#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+ +
+ +#endif /* CHECK_CUDA_ERRORS */
+ +
+ +/*! \brief CUDA device information.
+ + *
+ + * The CUDA device information is queried and set at detection and contains
+ + * both information about the device/hardware returned by the runtime as well
+ + * as additional data like support status.
+ + */
+ +struct gmx_device_info_t
+ +{
-     nvmlDevice_t        nvml_device_id;         /* NVML device id */
-     nvmlEnableState_t   nvml_is_restricted;     /* Status of application clocks permission */
- #endif                                          /* HAVE_NVML */
++    int                 id;                      /* id of the CUDA device */
++    cudaDeviceProp      prop;                    /* CUDA device properties */
++    int                 stat;                    /* result of the device check */
++    gmx_bool            nvml_initialized;        /* If NVML was initialized */
++    unsigned int        nvml_orig_app_sm_clock;  /* The original SM clock before we changed it */
++    unsigned int        nvml_orig_app_mem_clock; /* The original memory clock before we changed it */
++    gmx_bool            nvml_app_clocks_changed; /* If application clocks have been changed */
++    unsigned int        nvml_set_app_sm_clock;   /* The SM clock we set */
++    unsigned int        nvml_set_app_mem_clock;  /* The memory clock we set */
+ +#if HAVE_NVML
++    nvmlDevice_t        nvml_device_id;          /* NVML device id */
++    nvmlEnableState_t   nvml_is_restricted;      /* Status of application clocks permission */
++#endif                                           /* HAVE_NVML */
+ +};
+ +
+ +
+ +/*! Launches asynchronous host to device memory copy in stream 0. */
+ +int cu_copy_D2H(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+ +
+ +/*! Launches asynchronous host to device memory copy in stream s. */
+ +int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+ +
+ +/*! Allocates host memory and launches synchronous host to device memory copy. */
+ +int cu_copy_D2H_alloc(void ** /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+ +
+ +
+ +/*! Launches synchronous host to device memory copy. */
+ +int cu_copy_H2D(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+ +
+ +/*! Launches asynchronous host to device memory copy in stream s. */
+ +int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+ +
+ +/*! Allocates device memory and launches synchronous host to device memory copy. */
+ +int cu_copy_H2D_alloc(void ** /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+ +
+ +/*! Frees device memory and resets the size and allocation size to -1. */
+ +void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL);
+ +
+ +/*! Reallocates the device memory and copies data from the host. */
+ +void cu_realloc_buffered(void **d_dest, void *h_src,
+ +                         size_t type_size,
+ +                         int *curr_size, int *curr_alloc_size,
+ +                         int req_size,
+ +                         cudaStream_t s,
+ +                         bool bAsync);
+ +
+ +/*! Waits for event e to complete, */
+ +int cu_wait_event(cudaEvent_t /*e*/);
+ +
+ +/*! Calculates and returns the time elapsed between event start and end. */
+ +float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/);
+ +
+ +/*! Waits for event end to complete and calculates the time between start and end. */
+ +int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/);
+ +
+ +#endif
diff --cc src/gromacs/gpu_utils/gpu_utils.cu

index f08d55443e6d234175d1a822db25ebd03ea39a93,0000000000000000000000000000000000000000..e8b78249e909a45dcf6142ab1b3b83de3d1ff2c7

mode 100644,000000..100644
--- 1/src/gromacs/gpu_utils/gpu_utils.cu
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@@ -1,832 -1,0 +1,867 @@@
-     if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \file
+ + *  \brief Define functions for detection and initialization for CUDA devices.
+ + *
+ + *  \author Szilard Pall <pall.szilard@gmail.com>
+ + */
+ +
+ +#include "gmxpre.h"
+ +
+ +#include "gpu_utils.h"
+ +
+ +#include "config.h"
+ +
+ +#include <assert.h>
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +
+ +#include <cuda_profiler_api.h>
+ +
+ +#include "gromacs/gpu_utils/cudautils.cuh"
+ +#include "gromacs/gpu_utils/pmalloc_cuda.h"
+ +#include "gromacs/hardware/gpu_hw_info.h"
+ +#include "gromacs/utility/basedefinitions.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
+ +#if HAVE_NVML
+ +#include <nvml.h>
+ +#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
+ +#else  /* HAVE_NVML */
+ +#define HAVE_NVML_APPLICATION_CLOCKS 0
+ +#endif /* HAVE_NVML */
+ +
+ +#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
+ +/*! Check for NVML error on the return status of a NVML API call. */
+ +#  define HANDLE_NVML_RET_ERR(status, msg) \
+ +    do { \
+ +        if (status != NVML_SUCCESS) \
+ +        { \
+ +            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
+ +        } \
+ +    } while (0)
+ +#else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
+ +#  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
+ +#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
+ +
+ +#if HAVE_NVML_APPLICATION_CLOCKS
+ +static const gmx_bool            bCompiledWithApplicationClockSupport = true;
+ +#else
+ +static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
+ +#endif
+ +
+ +/*! \internal \brief
+ + * Max number of devices supported by CUDA (for consistency checking).
+ + *
+ + * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
+ + */
+ +static int  cuda_max_device_count = 32;
+ +
+ +static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
+ +
+ +/** Dummy kernel used for sanity checking. */
+ +__global__ void k_dummy_test()
+ +{
+ +}
+ +
+ +
+ +/*!
+ + * \brief Runs GPU sanity checks.
+ + *
+ + * Runs a series of checks to determine that the given GPU and underlying CUDA
+ + * driver/runtime functions properly.
+ + * Returns properties of a device with given ID or the one that has
+ + * already been initialized earlier in the case if of \dev_id == -1.
+ + *
+ + * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
+ + * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
+ + * \returns                0 if the device looks OK
+ + *
+ + * TODO: introduce errors codes and handle errors more smoothly.
+ + */
+ +static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
+ +{
+ +    cudaError_t cu_err;
+ +    int         dev_count, id;
+ +
+ +    cu_err = cudaGetDeviceCount(&dev_count);
+ +    if (cu_err != cudaSuccess)
+ +    {
+ +        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
+ +                cudaGetErrorString(cu_err));
+ +        return -1;
+ +    }
+ +
+ +    /* no CUDA compatible device at all */
+ +    if (dev_count == 0)
+ +    {
+ +        return -1;
+ +    }
+ +
+ +    /* things might go horribly wrong if cudart is not compatible with the driver */
+ +    if (dev_count < 0 || dev_count > cuda_max_device_count)
+ +    {
+ +        return -1;
+ +    }
+ +
+ +    if (dev_id == -1) /* device already selected let's not destroy the context */
+ +    {
+ +        cu_err = cudaGetDevice(&id);
+ +        if (cu_err != cudaSuccess)
+ +        {
+ +            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
+ +                    cudaGetErrorString(cu_err));
+ +            return -1;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        id = dev_id;
+ +        if (id > dev_count - 1) /* pfff there's no such device */
+ +        {
+ +            fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
+ +                    dev_id, dev_count);
+ +            return -1;
+ +        }
+ +    }
+ +
+ +    memset(dev_prop, 0, sizeof(cudaDeviceProp));
+ +    cu_err = cudaGetDeviceProperties(dev_prop, id);
+ +    if (cu_err != cudaSuccess)
+ +    {
+ +        fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
+ +                cudaGetErrorString(cu_err));
+ +        return -1;
+ +    }
+ +
+ +    /* both major & minor is 9999 if no CUDA capable devices are present */
+ +    if (dev_prop->major == 9999 && dev_prop->minor == 9999)
+ +    {
+ +        return -1;
+ +    }
+ +    /* we don't care about emulation mode */
+ +    if (dev_prop->major == 0)
+ +    {
+ +        return -1;
+ +    }
+ +
+ +    if (id != -1)
+ +    {
+ +        cu_err = cudaSetDevice(id);
+ +        if (cu_err != cudaSuccess)
+ +        {
+ +            fprintf(stderr, "Error %d while switching to device #%d: %s\n",
+ +                    cu_err, id, cudaGetErrorString(cu_err));
+ +            return -1;
+ +        }
+ +    }
+ +
+ +    /* try to execute a dummy kernel */
+ +    k_dummy_test<<< 1, 512>>> ();
+ +    if (cudaThreadSynchronize() != cudaSuccess)
+ +    {
+ +        return -1;
+ +    }
+ +
+ +    /* destroy context if we created one */
+ +    if (id != -1)
+ +    {
+ +        cu_err = cudaDeviceReset();
+ +        CU_RET_ERR(cu_err, "cudaDeviceReset failed");
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +#if HAVE_NVML
+ +/* TODO: We should actually be using md_print_warn in md_logging.c,
+ + * but we can't include mpi.h in CUDA code.
+ + */
+ +static void md_print_info(FILE       *fplog,
+ +                          const char *fmt, ...)
+ +{
+ +    va_list ap;
+ +
+ +    if (fplog != NULL)
+ +    {
+ +        /* We should only print to stderr on the master node,
+ +         * in most cases fplog is only set on the master node, so this works.
+ +         */
+ +        va_start(ap, fmt);
+ +        vfprintf(stderr, fmt, ap);
+ +        va_end(ap);
+ +
+ +        va_start(ap, fmt);
+ +        vfprintf(fplog, fmt, ap);
+ +        va_end(ap);
+ +    }
+ +}
+ +#endif /*HAVE_NVML*/
+ +
+ +/* TODO: We should actually be using md_print_warn in md_logging.c,
+ + * but we can't include mpi.h in CUDA code.
+ + * This is replicated from nbnxn_cuda_data_mgmt.cu.
+ + */
+ +static void md_print_warn(FILE       *fplog,
+ +                          const char *fmt, ...)
+ +{
+ +    va_list ap;
+ +
+ +    if (fplog != NULL)
+ +    {
+ +        /* We should only print to stderr on the master node,
+ +         * in most cases fplog is only set on the master node, so this works.
+ +         */
+ +        va_start(ap, fmt);
+ +        fprintf(stderr, "\n");
+ +        vfprintf(stderr, fmt, ap);
+ +        fprintf(stderr, "\n");
+ +        va_end(ap);
+ +
+ +        va_start(ap, fmt);
+ +        fprintf(fplog, "\n");
+ +        vfprintf(fplog, fmt, ap);
+ +        fprintf(fplog, "\n");
+ +        va_end(ap);
+ +    }
+ +}
+ +
+ +#if HAVE_NVML_APPLICATION_CLOCKS
+ +/*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
+ + *
+ + * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
+ + * matching PCI-E information from \cuda_dev with the available NVML devices.
+ + *
+ + * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
+ + * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
+ + */
+ +static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
+ +{
+ +    nvmlDevice_t nvml_device_id;
+ +    unsigned int nvml_device_count = 0;
+ +    nvmlReturn_t nvml_stat         = nvmlDeviceGetCount ( &nvml_device_count );
+ +    cuda_dev->nvml_initialized = false;
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
+ +    for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
+ +    {
+ +        nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
+ +        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
+ +        if (nvml_stat != NVML_SUCCESS)
+ +        {
+ +            break;
+ +        }
+ +
+ +        nvmlPciInfo_t nvml_pci_info;
+ +        nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
+ +        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
+ +        if (nvml_stat != NVML_SUCCESS)
+ +        {
+ +            break;
+ +        }
+ +        if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
+ +            static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
+ +            static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
+ +        {
+ +            cuda_dev->nvml_initialized = true;
+ +            cuda_dev->nvml_device_id   = nvml_device_id;
+ +            break;
+ +        }
+ +    }
+ +    return cuda_dev->nvml_initialized;
+ +}
++
++/*! \brief Reads and returns the application clocks for device.
++ *
++ * \param[in]  device        The GPU device
++ * \param[out] app_sm_clock  The current application SM clock
++ * \param[out] app_mem_clock The current application memory clock
++ * \returns if applacation clocks are supported
++ */
++static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
++                                 unsigned int            *app_sm_clock,
++                                 unsigned int            *app_mem_clock)
++{
++    nvmlReturn_t nvml_stat;
++
++    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
++    if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++    {
++        return false;
++    }
++    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
++    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
++
++    return true;
++}
+ +#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+ +
+ +/*! \brief Tries to set application clocks for the GPU with the given index.
+ + *
+ + * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
+ + * to handle the application clocks for. Application clocks are set to the
+ + * max supported value to increase performance if application clock permissions
+ + * allow this. For future GPU architectures a more sophisticated scheme might be
+ + * required.
+ + *
+ + * \param[out] fplog        log file to write to
+ + * \param[in] gpuid         index of the GPU to set application clocks for
+ + * \param[in] gpu_info      GPU info of all detected devices in the system.
+ + * \returns                 true if no error occurs during application clocks handling.
+ + */
+ +static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
+ +{
+ +    const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
+ +    int                   cuda_version_number         = prop->major * 10 + prop->minor;
+ +    gmx_bool              bGpuCanUseApplicationClocks =
+ +        ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
+ +         (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
+ +    if (!bGpuCanUseApplicationClocks)
+ +    {
+ +        return true;
+ +    }
+ +#if !HAVE_NVML
+ +    int cuda_driver  = 0;
+ +    int cuda_runtime = 0;
+ +    cudaDriverGetVersion(&cuda_driver);
+ +    cudaRuntimeGetVersion(&cuda_runtime);
+ +    md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
+ +                   "      application clocks of the detected %s GPU to improve performance.\n"
+ +                   "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
+ +                   prop->name);
+ +    return true;
+ +#else
+ +    if (!bCompiledWithApplicationClockSupport)
+ +    {
+ +        int cuda_driver  = 0;
+ +        int cuda_runtime = 0;
+ +        cudaDriverGetVersion(&cuda_driver);
+ +        cudaRuntimeGetVersion(&cuda_runtime);
+ +        md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
+ +                       "      managing application clocks of the detected %s GPU to improve performance.\n"
+ +                       "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
+ +                       prop->name );
+ +        return true;
+ +    }
+ +
+ +    /* We've compiled with NVML application clocks support, and have a GPU that can use it */
+ +    nvmlReturn_t nvml_stat = NVML_SUCCESS;
+ +    char        *env;
+ +    //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
+ +    //      this variable can be later used to give a user more fine grained control.
+ +    env = getenv("GMX_GPU_APPLICATION_CLOCKS");
+ +    if (env != NULL && ( strcmp( env, "0") == 0 ||
+ +                         gmx_strcasecmp( env, "OFF") == 0 ||
+ +                         gmx_strcasecmp( env, "DISABLE") == 0 ))
+ +    {
+ +        return true;
+ +    }
+ +    nvml_stat = nvmlInit();
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
+ +    if (nvml_stat != NVML_SUCCESS)
+ +    {
+ +        return false;
+ +    }
-     unsigned int app_sm_clock  = 0;
-     unsigned int app_mem_clock = 0;
-     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
-     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
++
++    gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
++
++    if (!addNVMLDeviceId(cuda_dev))
+ +    {
+ +        return false;
+ +    }
+ +    //get current application clocks setting
-     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
-     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
++    if (!getApplicationClocks(cuda_dev,
++                              &cuda_dev->nvml_orig_app_sm_clock,
++                              &cuda_dev->nvml_orig_app_mem_clock))
+ +    {
+ +        return false;
+ +    }
-     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
+ +    //get max application clocks
+ +    unsigned int max_sm_clock  = 0;
+ +    unsigned int max_mem_clock = 0;
-     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
++    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-     gpu_info->gpu_dev[gpuid].nvml_is_restricted     = NVML_FEATURE_ENABLED;
-     gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
++    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
+ +
-     nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
++    cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
++    cuda_dev->nvml_app_clocks_changed = false;
+ +
-     if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
++    nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
+ +
+ +    /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
+ +       e.g. if max application clocks should not be used for certain GPUs. */
-         md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
-         nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
++    if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED)
+ +    {
-         gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
++        md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock);
++        nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
+ +        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-     else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
++        cuda_dev->nvml_app_clocks_changed = true;
++        cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
++        cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
+ +    }
-         md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
++    else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock)
+ +    {
-     else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
++        md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
+ +    }
-         //TODO: This should probably be integrated into the GPU Properties table.
-         md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
++    else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock)
+ +    {
-         cuda_dev->nvml_ap_clocks_changed)
++        md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
+ +    }
+ +    else
+ +    {
+ +        md_print_warn( fplog,  "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
+ +    }
+ +    return (nvml_stat == NVML_SUCCESS);
+ +#endif /* HAVE_NVML */
+ +}
+ +
+ +/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
+ + *
+ + * \param[in] gpu_dev  CUDA device information
+ + */
+ +static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
+ +{
+ +#if !HAVE_NVML_APPLICATION_CLOCKS
+ +    GMX_UNUSED_VALUE(cuda_dev);
+ +    return true;
+ +#else /* HAVE_NVML_APPLICATION_CLOCKS */
+ +    nvmlReturn_t nvml_stat = NVML_SUCCESS;
+ +    if (cuda_dev &&
+ +        cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
-         nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id );
-         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" );
++        cuda_dev->nvml_app_clocks_changed)
+ +    {
++        /* Check if the clocks are still what we set them to.
++         * If so, set them back to the state we originally found them in.
++         * If not, don't touch them, because something else set them later.
++         */
++        unsigned int app_sm_clock, app_mem_clock;
++        getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
++        if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
++            app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
++        {
++            nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
++            HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
++        }
+ +    }
+ +    nvml_stat = nvmlShutdown();
+ +    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
+ +    return (nvml_stat == NVML_SUCCESS);
+ +#endif /* HAVE_NVML_APPLICATION_CLOCKS */
+ +}
+ +
+ +gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
+ +                  const struct gmx_gpu_info_t *gpu_info,
+ +                  const struct gmx_gpu_opt_t *gpu_opt)
+ +{
+ +    cudaError_t stat;
+ +    char        sbuf[STRLEN];
+ +    int         gpuid;
+ +
+ +    assert(gpu_info);
+ +    assert(result_str);
+ +
+ +    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
+ +    {
+ +        sprintf(sbuf, "Trying to initialize an inexistent GPU: "
+ +                "there are %d %s-selected GPU(s), but #%d was requested.",
+ +                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+ +        gmx_incons(sbuf);
+ +    }
+ +
+ +    gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
+ +
+ +    stat = cudaSetDevice(gpuid);
+ +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
+ +    }
+ +
+ +    //Ignoring return value as NVML errors should be treated not critical.
+ +    if (stat == cudaSuccess)
+ +    {
+ +        init_gpu_application_clocks(fplog, gpuid, gpu_info);
+ +    }
+ +    return (stat == cudaSuccess);
+ +}
+ +
+ +gmx_bool free_cuda_gpu(
+ +        int gmx_unused mygpu, char *result_str,
+ +        const gmx_gpu_info_t gmx_unused *gpu_info,
+ +        const gmx_gpu_opt_t gmx_unused *gpu_opt
+ +        )
+ +{
+ +    cudaError_t  stat;
+ +    gmx_bool     reset_gpu_application_clocks_status = true;
+ +    int          gpuid;
+ +
+ +    assert(result_str);
+ +
+ +    if (debug)
+ +    {
+ +        int gpuid;
+ +        stat = cudaGetDevice(&gpuid);
+ +        CU_RET_ERR(stat, "cudaGetDevice failed");
+ +        fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+ +    }
+ +
+ +    gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
+ +    if (gpuid != -1)
+ +    {
+ +        reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
+ +    }
+ +
+ +    stat = cudaDeviceReset();
+ +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+ +    return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
+ +}
+ +
+ +/*! \brief Returns true if the gpu characterized by the device properties is
+ + *  supported by the native gpu acceleration.
+ + *
+ + * \param[in] dev_prop  the CUDA device properties of the gpus to test.
+ + * \returns             true if the GPU properties passed indicate a compatible
+ + *                      GPU, otherwise false.
+ + */
+ +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
+ +{
+ +    return (dev_prop->major >= 2);
+ +}
+ +
+ +/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
+ + *
+ + * \param[in] stat  GPU status.
+ + * \returns         true if the provided status is egpuCompatible, otherwise false.
+ + */
+ +static bool is_compatible_gpu(int stat)
+ +{
+ +    return (stat == egpuCompatible);
+ +}
+ +
+ +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
+ + *
+ + *  Returns a status value which indicates compatibility or one of the following
+ + *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
+ + *  It also returns the respective device's properties in \dev_prop (if applicable).
+ + *
+ + *  \param[in]  dev_id   the ID of the GPU to check.
+ + *  \param[out] dev_prop the CUDA device properties of the device checked.
+ + *  \returns             the status of the requested device
+ + */
+ +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
+ +{
+ +    cudaError_t stat;
+ +    int         ndev;
+ +
+ +    stat = cudaGetDeviceCount(&ndev);
+ +    if (stat != cudaSuccess)
+ +    {
+ +        return egpuInsane;
+ +    }
+ +
+ +    if (dev_id > ndev - 1)
+ +    {
+ +        return egpuNonexistent;
+ +    }
+ +
+ +    /* TODO: currently we do not make a distinction between the type of errors
+ +     * that can appear during sanity checks. This needs to be improved, e.g if
+ +     * the dummy test kernel fails to execute with a "device busy message" we
+ +     * should appropriately report that the device is busy instead of insane.
+ +     */
+ +    if (do_sanity_checks(dev_id, dev_prop) == 0)
+ +    {
+ +        if (is_gmx_supported_gpu(dev_prop))
+ +        {
+ +            return egpuCompatible;
+ +        }
+ +        else
+ +        {
+ +            return egpuIncompatible;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        return egpuInsane;
+ +    }
+ +}
+ +
+ +
+ +int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+ +{
+ +    int                i, ndev, checkres, retval;
+ +    cudaError_t        stat;
+ +    cudaDeviceProp     prop;
+ +    gmx_device_info_t *devs;
+ +
+ +    assert(gpu_info);
+ +    assert(err_str);
+ +
+ +    gpu_info->n_dev_compatible = 0;
+ +
+ +    ndev    = 0;
+ +    devs    = NULL;
+ +
+ +    stat = cudaGetDeviceCount(&ndev);
+ +    if (stat != cudaSuccess)
+ +    {
+ +        const char *s;
+ +
+ +        /* cudaGetDeviceCount failed which means that there is something
+ +         * wrong with the machine: driver-runtime mismatch, all GPUs being
+ +         * busy in exclusive mode, or some other condition which should
+ +         * result in us issuing a warning a falling back to CPUs. */
+ +        retval = -1;
+ +        s      = cudaGetErrorString(stat);
+ +        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
+ +    }
+ +    else
+ +    {
+ +        snew(devs, ndev);
+ +        for (i = 0; i < ndev; i++)
+ +        {
+ +            checkres = is_gmx_supported_gpu_id(i, &prop);
+ +
+ +            devs[i].id   = i;
+ +            devs[i].prop = prop;
+ +            devs[i].stat = checkres;
+ +
+ +            if (checkres == egpuCompatible)
+ +            {
+ +                gpu_info->n_dev_compatible++;
+ +            }
+ +        }
+ +        retval = 0;
+ +    }
+ +
+ +    gpu_info->n_dev   = ndev;
+ +    gpu_info->gpu_dev = devs;
+ +
+ +    return retval;
+ +}
+ +
+ +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+ +                          gmx_gpu_opt_t        *gpu_opt)
+ +{
+ +    int  i, ncompat;
+ +    int *compat;
+ +
+ +    assert(gpu_info);
+ +    /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
+ +    assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
+ +
+ +    snew(compat, gpu_info->n_dev);
+ +    ncompat = 0;
+ +    for (i = 0; i < gpu_info->n_dev; i++)
+ +    {
+ +        if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
+ +        {
+ +            ncompat++;
+ +            compat[ncompat - 1] = i;
+ +        }
+ +    }
+ +
+ +    gpu_opt->n_dev_compatible = ncompat;
+ +    snew(gpu_opt->dev_compatible, ncompat);
+ +    memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
+ +    sfree(compat);
+ +}
+ +
+ +gmx_bool check_selected_gpus(int                  *checkres,
+ +                             const gmx_gpu_info_t *gpu_info,
+ +                             gmx_gpu_opt_t        *gpu_opt)
+ +{
+ +    int  i, id;
+ +    bool bAllOk;
+ +
+ +    assert(checkres);
+ +    assert(gpu_info);
+ +    assert(gpu_opt->n_dev_use >= 0);
+ +
+ +    if (gpu_opt->n_dev_use == 0)
+ +    {
+ +        return TRUE;
+ +    }
+ +
+ +    assert(gpu_opt->dev_use);
+ +
+ +    /* we will assume that all GPUs requested are valid IDs,
+ +       otherwise we'll bail anyways */
+ +
+ +    bAllOk = true;
+ +    for (i = 0; i < gpu_opt->n_dev_use; i++)
+ +    {
+ +        id = gpu_opt->dev_use[i];
+ +
+ +        /* devices are stored in increasing order of IDs in gpu_dev */
+ +        gpu_opt->dev_use[i] = id;
+ +
+ +        checkres[i] = (id >= gpu_info->n_dev) ?
+ +            egpuNonexistent : gpu_info->gpu_dev[id].stat;
+ +
+ +        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+ +    }
+ +
+ +    return bAllOk;
+ +}
+ +
+ +void free_gpu_info(const gmx_gpu_info_t *gpu_info)
+ +{
+ +    if (gpu_info == NULL)
+ +    {
+ +        return;
+ +    }
+ +
+ +    sfree(gpu_info->gpu_dev);
+ +}
+ +
+ +void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
+ +{
+ +    assert(s);
+ +    assert(gpu_info);
+ +
+ +    if (index < 0 && index >= gpu_info->n_dev)
+ +    {
+ +        return;
+ +    }
+ +
+ +    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
+ +
+ +    bool               bGpuExists =
+ +        dinfo->stat == egpuCompatible ||
+ +        dinfo->stat == egpuIncompatible;
+ +
+ +    if (!bGpuExists)
+ +    {
+ +        sprintf(s, "#%d: %s, stat: %s",
+ +                dinfo->id, "N/A",
+ +                gpu_detect_res_str[dinfo->stat]);
+ +    }
+ +    else
+ +    {
+ +        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
+ +                dinfo->id, dinfo->prop.name,
+ +                dinfo->prop.major, dinfo->prop.minor,
+ +                dinfo->prop.ECCEnabled ? "yes" : " no",
+ +                gpu_detect_res_str[dinfo->stat]);
+ +    }
+ +}
+ +
+ +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ +                      const gmx_gpu_opt_t  *gpu_opt,
+ +                      int                   idx)
+ +{
+ +    assert(gpu_info);
+ +    assert(gpu_opt);
+ +    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+ +
+ +    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
+ +}
+ +
+ +int get_current_cuda_gpu_device_id(void)
+ +{
+ +    int gpuid;
+ +    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
+ +
+ +    return gpuid;
+ +}
+ +
+ +size_t sizeof_gpu_dev_info(void)
+ +{
+ +    return sizeof(gmx_device_info_t);
+ +}
+ +
+ +void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
+ +                                  gmx_host_alloc_t **nb_alloc,
+ +                                  gmx_host_free_t  **nb_free)
+ +{
+ +    if (bUseGpuKernels)
+ +    {
+ +        *nb_alloc = &pmalloc;
+ +        *nb_free  = &pfree;
+ +    }
+ +    else
+ +    {
+ +        *nb_alloc = NULL;
+ +        *nb_free  = NULL;
+ +    }
+ +}
+ +
+ +void startGpuProfiler(void)
+ +{
+ +    /* The NVPROF_ID environment variable is set by nvprof and indicates that
+ +       mdrun is executed in the CUDA profiler.
+ +       If nvprof was run is with "--profile-from-start off", the profiler will
+ +       be started here. This way we can avoid tracing the CUDA events from the
+ +       first part of the run. Starting the profiler again does nothing.
+ +     */
+ +    if (cudaProfilerRun)
+ +    {
+ +        cudaError_t stat;
+ +        stat = cudaProfilerStart();
+ +        CU_RET_ERR(stat, "cudaProfilerStart failed");
+ +    }
+ +}
+ +
+ +void stopGpuProfiler(void)
+ +{
+ +    /* Stopping the nvidia here allows us to eliminate the subsequent
+ +       API calls from the trace, e.g. uninitialization and cleanup. */
+ +    if (cudaProfilerRun)
+ +    {
+ +        cudaError_t stat;
+ +        stat = cudaProfilerStop();
+ +        CU_RET_ERR(stat, "cudaProfilerStop failed");
+ +    }
+ +}
+ +
+ +void resetGpuProfiler(void)
+ +{
+ +    /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
+ +     *  the profiling here (can't stop it) which will achieve the desired effect if
+ +     *  the run was started with the profiling disabled.
+ +     *
+ +     * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
+ +     * stopGpuProfiler();
+ +     */
+ +    if (cudaProfilerRun)
+ +    {
+ +        startGpuProfiler();
+ +    }
+ +}
diff --cc src/gromacs/hardware/hardwaretopology.cpp

index 8a284ae0e2e40130c8d5e3af0dc7ea4fc92b49d2,0000000000000000000000000000000000000000..59a4d1d9ffec41f8e881a75f4d641a59c48a1a6c

mode 100644,000000..100644
--- 1/src/gromacs/hardware/hardwaretopology.cpp
--- /dev/null
+++ b/src/gromacs/hardware/hardwaretopology.cpp
@@@ -1,677 -1,0 +1,712 @@@
-             md_print_warn(cr, fplog,
-                           "%d CPUs configured, but only %d of them are online.\n"
-                           "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n"
-                           "off to save power, and will turn them back on later when the load increases.\n"
-                           "However, this will likely mean GROMACS cannot pin threads to those cores. You\n"
-                           "will likely see much better performance by forcing all cores to be online, and\n"
-                           "making sure they run at their full clock frequency.", count, countOnline);
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +/*! \internal \file
+ + * \brief
+ + * Implements gmx::HardwareTopology.
+ + *
+ + * \author Erik Lindahl <erik.lindahl@gmail.com>
+ + * \ingroup module_hardware
+ + */
+ +
+ +#include "gmxpre.h"
+ +
+ +#include "hardwaretopology.h"
+ +
+ +#include "config.h"
+ +
+ +#include <cstdio>
+ +
+ +#include <algorithm>
+ +#include <vector>
+ +
+ +#if GMX_HWLOC
+ +#    include <hwloc.h>
+ +#endif
+ +
+ +#include "gromacs/gmxlib/md_logging.h"
+ +#include "gromacs/hardware/cpuinfo.h"
+ +#include "gromacs/utility/gmxassert.h"
+ +#include "gromacs/utility/gmxomp.h"
+ +
+ +#ifdef HAVE_UNISTD_H
+ +#    include <unistd.h>       // sysconf()
+ +#endif
+ +#if GMX_NATIVE_WINDOWS
+ +#    include <windows.h>      // GetSystemInfo()
+ +#endif
+ +
++#if defined(_M_ARM) || defined(__arm__) || defined(__ARM_ARCH) || defined (__aarch64__)
++//! Constant used to help minimize preprocessed code
++static const bool isArm = true;
++#else
++//! Constant used to help minimize preprocessed code
++static const bool isArm = false;
++#endif
++
+ +namespace gmx
+ +{
+ +
+ +namespace
+ +{
+ +
+ +/*****************************************************************************
+ + *                                                                           *
+ + *   Utility functions for extracting hardware topology from CpuInfo object  *
+ + *                                                                           *
+ + *****************************************************************************/
+ +
+ +/*! \brief Initialize machine data from basic information in cpuinfo
+ + *
+ + *  \param  machine      Machine tree structure where information will be assigned
+ + *                       if the cpuinfo object contains topology information.
+ + *  \param  supportLevel If topology information is available in CpuInfo,
+ + *                       this will be updated to reflect the amount of
+ + *                       information written to the machine structure.
+ + */
+ +void
+ +parseCpuInfo(HardwareTopology::Machine *        machine,
+ +             HardwareTopology::SupportLevel *   supportLevel)
+ +{
+ +    CpuInfo cpuInfo(CpuInfo::detect());
+ +
+ +    if (!cpuInfo.logicalProcessors().empty())
+ +    {
+ +        int nSockets   = 0;
+ +        int nCores     = 0;
+ +        int nHwThreads = 0;
+ +
+ +        // Copy the logical processor information from cpuinfo
+ +        for (auto &l : cpuInfo.logicalProcessors())
+ +        {
+ +            machine->logicalProcessors.push_back( { l.socketRankInMachine, l.coreRankInSocket, l.hwThreadRankInCore, -1 } );
+ +            nSockets   = std::max(nSockets, l.socketRankInMachine);
+ +            nCores     = std::max(nCores, l.coreRankInSocket);
+ +            nHwThreads = std::max(nHwThreads, l.hwThreadRankInCore);
+ +        }
+ +
+ +        // Fill info form sockets/cores/hwthreads
+ +        int socketId   = 0;
+ +        int coreId     = 0;
+ +        int hwThreadId = 0;
+ +
+ +        machine->sockets.resize(nSockets + 1);
+ +        for (auto &s : machine->sockets)
+ +        {
+ +            s.id = socketId++;
+ +            s.cores.resize(nCores + 1);
+ +            for (auto &c : s.cores)
+ +            {
+ +                c.id         = coreId++;
+ +                c.numaNodeId = -1; // No numa information
+ +                c.hwThreads.resize(nHwThreads + 1);
+ +                for (auto &t : c.hwThreads)
+ +                {
+ +                    t.id                 = hwThreadId++;
+ +                    t.logicalProcessorId = -1; // set as unassigned for now
+ +                }
+ +            }
+ +        }
+ +
+ +        // Fill the logical processor id in the right place
+ +        for (std::size_t i = 0; i < machine->logicalProcessors.size(); i++)
+ +        {
+ +            const HardwareTopology::LogicalProcessor &l = machine->logicalProcessors[i];
+ +            machine->sockets[l.socketRankInMachine].cores[l.coreRankInSocket].hwThreads[l.hwThreadRankInCore].logicalProcessorId = static_cast<int>(i);
+ +        }
+ +        machine->logicalProcessorCount = machine->logicalProcessors.size();
+ +        *supportLevel                  = HardwareTopology::SupportLevel::Basic;
+ +    }
+ +    else
+ +    {
+ +        *supportLevel = HardwareTopology::SupportLevel::None;
+ +    }
+ +}
+ +
+ +#if GMX_HWLOC
+ +
+ +#if HWLOC_API_VERSION < 0x00010b00
+ +#    define HWLOC_OBJ_PACKAGE  HWLOC_OBJ_SOCKET
+ +#    define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
+ +#endif
+ +
+ +/*****************************************************************************
+ + *                                                                           *
+ + *   Utility functions for extracting hardware topology from hwloc library   *
+ + *                                                                           *
+ + *****************************************************************************/
+ +
+ +/*! \brief Return vector of all descendants of a given type in hwloc topology
+ + *
+ + *  \param obj   Non-null hwloc object.
+ + *  \param type  hwloc object type to find. The routine will only search
+ + *               on levels below obj.
+ + *
+ + *  \return vector containing all the objects of given type that are
+ + *          descendants of the provided object. If no objects of this type
+ + *          were found, the vector will be empty.
+ + */
+ +const std::vector<hwloc_obj_t>
+ +getHwLocDescendantsByType(const hwloc_obj_t obj, const hwloc_obj_type_t type)
+ +{
+ +    GMX_RELEASE_ASSERT(obj, "NULL hwloc object provided to getHwLocDescendantsByType()");
+ +
+ +    std::vector<hwloc_obj_t> v;
+ +
+ +    // Go through children; if this object has no children obj->arity is 0,
+ +    // and we'll return an empty vector.
+ +    for (std::size_t i = 0; i < obj->arity; i++)
+ +    {
+ +        // If the child is the type we're looking for, add it directly.
+ +        // Otherwise call this routine recursively for each child.
+ +        if (obj->children[i]->type == type)
+ +        {
+ +            v.push_back(obj->children[i]);
+ +        }
+ +        else
+ +        {
+ +            std::vector<hwloc_obj_t> v2 = getHwLocDescendantsByType(obj->children[i], type);
+ +            v.insert(v.end(), v2.begin(), v2.end());
+ +        }
+ +    }
+ +    return v;
+ +}
+ +
+ +/*! \brief Read information about sockets, cores and threads from hwloc topology
+ + *
+ + *  \param topo    hwloc topology handle that has been initialized and loaded
+ + *  \param machine Pointer to the machine structure in the HardwareTopology
+ + *                 class, where the tree of sockets/cores/threads will be written.
+ + *
+ + *  \return If all the data is found the return value is 0, otherwise non-zero.
+ + */
+ +int
+ +parseHwLocSocketsCoresThreads(const hwloc_topology_t             topo,
+ +                              HardwareTopology::Machine *        machine)
+ +{
+ +    const hwloc_obj_t              root         = hwloc_get_root_obj(topo);
+ +    std::vector<hwloc_obj_t>       hwlocSockets = getHwLocDescendantsByType(root, HWLOC_OBJ_PACKAGE);
+ +
+ +    machine->logicalProcessorCount = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
+ +    machine->logicalProcessors.resize(machine->logicalProcessorCount);
+ +    machine->sockets.resize(hwlocSockets.size());
+ +
+ +    bool topologyOk = !hwlocSockets.empty(); // Fail if we have no sockets in machine
+ +
+ +    for (std::size_t i = 0; i < hwlocSockets.size() && topologyOk; i++)
+ +    {
+ +        // Assign information about this socket
+ +        machine->sockets[i].id = hwlocSockets[i]->logical_index;
+ +
+ +        // Get children (cores)
+ +        std::vector<hwloc_obj_t> hwlocCores = getHwLocDescendantsByType(hwlocSockets[i], HWLOC_OBJ_CORE);
+ +        machine->sockets[i].cores.resize(hwlocCores.size());
+ +
+ +        topologyOk = topologyOk && !hwlocCores.empty(); // Fail if we have no cores in socket
+ +
+ +        // Loop over child cores
+ +        for (std::size_t j = 0; j < hwlocCores.size() && topologyOk; j++)
+ +        {
+ +            // Assign information about this core
+ +            machine->sockets[i].cores[j].id         = hwlocCores[j]->logical_index;
+ +            machine->sockets[i].cores[j].numaNodeId = -1;
+ +
+ +            // Get children (hwthreads)
+ +            std::vector<hwloc_obj_t> hwlocPUs = getHwLocDescendantsByType(hwlocCores[j], HWLOC_OBJ_PU);
+ +            machine->sockets[i].cores[j].hwThreads.resize(hwlocPUs.size());
+ +
+ +            topologyOk = topologyOk && !hwlocPUs.empty(); // Fail if we have no hwthreads in core
+ +
+ +            // Loop over child hwthreads
+ +            for (std::size_t k = 0; k < hwlocPUs.size() && topologyOk; k++)
+ +            {
+ +                // Assign information about this hwthread
+ +                std::size_t logicalProcessorId                               = hwlocPUs[k]->os_index;
+ +                machine->sockets[i].cores[j].hwThreads[k].id                 = hwlocPUs[k]->logical_index;
+ +                machine->sockets[i].cores[j].hwThreads[k].logicalProcessorId = logicalProcessorId;
+ +
+ +                if (logicalProcessorId < machine->logicalProcessors.size())
+ +                {
+ +                    // Cross-assign data for this hwthread to the logicalprocess vector
+ +                    machine->logicalProcessors[logicalProcessorId].socketRankInMachine = static_cast<int>(i);
+ +                    machine->logicalProcessors[logicalProcessorId].coreRankInSocket    = static_cast<int>(j);
+ +                    machine->logicalProcessors[logicalProcessorId].hwThreadRankInCore  = static_cast<int>(k);
+ +                    machine->logicalProcessors[logicalProcessorId].numaNodeId          = -1;
+ +                }
+ +                else
+ +                {
+ +                    topologyOk = false;
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    if (topologyOk)
+ +    {
+ +        return 0;
+ +    }
+ +    else
+ +    {
+ +        machine->logicalProcessors.clear();
+ +        machine->sockets.clear();
+ +        return -1;
+ +    }
+ +}
+ +
+ +/*! \brief Read cache information from hwloc topology
+ + *
+ + *  \param topo    hwloc topology handle that has been initialized and loaded
+ + *  \param machine Pointer to the machine structure in the HardwareTopology
+ + *                 class, where cache data will be filled.
+ + *
+ + *  \return If any cache data is found the return value is 0, otherwise non-zero.
+ + */
+ +int
+ +parseHwLocCache(const hwloc_topology_t             topo,
+ +                HardwareTopology::Machine *        machine)
+ +{
+ +    // Parse caches up to L5
+ +    for (int cachelevel : { 1, 2, 3, 4, 5})
+ +    {
+ +        int depth = hwloc_get_cache_type_depth(topo, cachelevel, HWLOC_OBJ_CACHE_DATA);
+ +
+ +        if (depth >= 0)
+ +        {
+ +            hwloc_obj_t cache = hwloc_get_next_obj_by_depth(topo, depth, NULL);
+ +            if (cache != NULL)
+ +            {
+ +                std::vector<hwloc_obj_t> hwThreads = getHwLocDescendantsByType(cache, HWLOC_OBJ_PU);
+ +
+ +                machine->caches.push_back( {
+ +                                               static_cast<int>(cache->attr->cache.depth),
+ +                                               static_cast<std::size_t>(cache->attr->cache.size),
+ +                                               static_cast<int>(cache->attr->cache.linesize),
+ +                                               static_cast<int>(cache->attr->cache.associativity),
+ +                                               std::max(static_cast<int>(hwThreads.size()), 1)
+ +                                           } );
+ +            }
+ +        }
+ +    }
+ +    return machine->caches.empty();
+ +}
+ +
+ +
+ +/*! \brief Read numa information from hwloc topology
+ + *
+ + *  \param topo    hwloc topology handle that has been initialized and loaded
+ + *  \param machine Pointer to the machine structure in the HardwareTopology
+ + *                 class, where numa information will be filled.
+ + *
+ + *  Hwloc should virtually always be able to detect numa information, but if
+ + *  there is only a single numa node in the system it is not reported at all.
+ + *  In this case we create a single numa node covering all cores.
+ + *
+ + *  This function uses the basic socket/core/thread information detected by
+ + *  parseHwLocSocketsCoresThreads(), which means that routine must have
+ + *  completed successfully before calling this one. If this is not the case,
+ + *  you will get an error return code.
+ + *
+ + *  \return If the data found makes sense (either in the numa node or the
+ + *          entire machine) the return value is 0, otherwise non-zero.
+ + */
+ +int
+ +parseHwLocNuma(const hwloc_topology_t             topo,
+ +               HardwareTopology::Machine *        machine)
+ +{
+ +    const hwloc_obj_t        root           = hwloc_get_root_obj(topo);
+ +    std::vector<hwloc_obj_t> hwlocNumaNodes = getHwLocDescendantsByType(root, HWLOC_OBJ_NUMANODE);
+ +    bool                     topologyOk     = true;
+ +
+ +    if (!hwlocNumaNodes.empty())
+ +    {
+ +        machine->numa.nodes.resize(hwlocNumaNodes.size());
+ +
+ +        for (std::size_t i = 0; i < hwlocNumaNodes.size(); i++)
+ +        {
+ +            machine->numa.nodes[i].id     = hwlocNumaNodes[i]->logical_index;
+ +            machine->numa.nodes[i].memory = hwlocNumaNodes[i]->memory.total_memory;
+ +            machine->numa.nodes[i].logicalProcessorId.clear();
+ +
+ +            // Get list of PUs in this numa node
+ +            std::vector<hwloc_obj_t> hwlocPUs = getHwLocDescendantsByType(hwlocNumaNodes[i], HWLOC_OBJ_PU);
+ +
+ +            for (auto &p : hwlocPUs)
+ +            {
+ +                machine->numa.nodes[i].logicalProcessorId.push_back(p->os_index);
+ +
+ +                GMX_RELEASE_ASSERT(p->os_index < machine->logicalProcessors.size(), "OS index of PU in hwloc larger than processor count");
+ +
+ +                machine->logicalProcessors[p->os_index].numaNodeId = static_cast<int>(i);
+ +                std::size_t s = machine->logicalProcessors[p->os_index].socketRankInMachine;
+ +                std::size_t c = machine->logicalProcessors[p->os_index].coreRankInSocket;
+ +
+ +                GMX_RELEASE_ASSERT(s < machine->sockets.size(), "Socket index in logicalProcessors larger than socket count");
+ +                GMX_RELEASE_ASSERT(c < machine->sockets[s].cores.size(), "Core index in logicalProcessors larger than core count");
+ +                // Set numaNodeId in core too
+ +                machine->sockets[s].cores[c].numaNodeId = i;
+ +            }
+ +        }
+ +
+ +        int depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE);
+ +        const struct hwloc_distances_s * dist = hwloc_get_whole_distance_matrix_by_depth(topo, depth);
+ +        if (dist != NULL && dist->nbobjs == hwlocNumaNodes.size())
+ +        {
+ +            machine->numa.baseLatency        = dist->latency_base;
+ +            machine->numa.maxRelativeLatency = dist->latency_max;
+ +            machine->numa.relativeLatency.resize(dist->nbobjs);
+ +            for (std::size_t i = 0; i < dist->nbobjs; i++)
+ +            {
+ +                machine->numa.relativeLatency[i].resize(dist->nbobjs);
+ +                for (std::size_t j = 0; j < dist->nbobjs; j++)
+ +                {
+ +                    machine->numa.relativeLatency[i][j] = dist->latency[i*dist->nbobjs+j];
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            topologyOk = false;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        // No numa nodes found. Use the entire machine as a numa node.
+ +        const hwloc_obj_t hwlocMachine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, NULL);
+ +
+ +        if (hwlocMachine != NULL)
+ +        {
+ +            machine->numa.nodes.resize(1);
+ +            machine->numa.nodes[0].id           = 0;
+ +            machine->numa.nodes[0].memory       = hwlocMachine->memory.total_memory;
+ +            machine->numa.baseLatency           = 10;
+ +            machine->numa.maxRelativeLatency    = 1;
+ +            machine->numa.relativeLatency       = { { 1.0 } };
+ +
+ +            for (int i = 0; i < machine->logicalProcessorCount; i++)
+ +            {
+ +                machine->numa.nodes[0].logicalProcessorId.push_back(i);
+ +            }
+ +            for (auto &l : machine->logicalProcessors)
+ +            {
+ +                l.numaNodeId = 0;
+ +            }
+ +            for (auto &s : machine->sockets)
+ +            {
+ +                for (auto &c : s.cores)
+ +                {
+ +                    c.numaNodeId = 0;
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            topologyOk = false;
+ +        }
+ +    }
+ +
+ +    if (topologyOk)
+ +    {
+ +        return 0;
+ +    }
+ +    else
+ +    {
+ +        machine->numa.nodes.clear();
+ +        return -1;
+ +    }
+ +
+ +}
+ +
+ +/*! \brief Read PCI device information from hwloc topology
+ + *
+ + *  \param topo    hwloc topology handle that has been initialized and loaded
+ + *  \param machine Pointer to the machine structure in the HardwareTopology
+ + *                 class, where PCI device information will be filled.
+ + * *
+ + *  \return If any devices were found the return value is 0, otherwise non-zero.
+ + */
+ +int
+ +parseHwLocDevices(const hwloc_topology_t             topo,
+ +                  HardwareTopology::Machine *        machine)
+ +{
+ +    const hwloc_obj_t        root    = hwloc_get_root_obj(topo);
+ +    std::vector<hwloc_obj_t> pcidevs = getHwLocDescendantsByType(root, HWLOC_OBJ_PCI_DEVICE);
+ +
+ +    for (auto &p : pcidevs)
+ +    {
+ +        const hwloc_obj_t ancestor = hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_NUMANODE, p);
+ +        int               numaId;
+ +        if (ancestor != NULL)
+ +        {
+ +            numaId = ancestor->logical_index;
+ +        }
+ +        else
+ +        {
+ +            // If we only have a single numa node we belong to it, otherwise set it to -1 (unknown)
+ +            numaId = (machine->numa.nodes.size() == 1) ?  0 : -1;
+ +        }
+ +
+ +        GMX_RELEASE_ASSERT(p->attr, "Attributes should not be NULL for hwloc PCI object");
+ +
+ +        machine->devices.push_back( {
+ +                                        p->attr->pcidev.vendor_id,
+ +                                        p->attr->pcidev.device_id,
+ +                                        p->attr->pcidev.class_id,
+ +                                        p->attr->pcidev.domain,
+ +                                        p->attr->pcidev.bus,
+ +                                        p->attr->pcidev.dev,
+ +                                        p->attr->pcidev.func,
+ +                                        numaId
+ +                                    } );
+ +    }
+ +    return pcidevs.empty();
+ +}
+ +
+ +void
+ +parseHwLoc(HardwareTopology::Machine *        machine,
+ +           HardwareTopology::SupportLevel *   supportLevel,
+ +           bool *                             isThisSystem)
+ +{
+ +    hwloc_topology_t    topo;
+ +
+ +    // Initialize a hwloc object, set flags to request IO device information too,
+ +    // try to load the topology, and get the root object. If either step fails,
+ +    // return that we do not have any support at all from hwloc.
+ +    if (hwloc_topology_init(&topo) != 0)
+ +    {
+ +        hwloc_topology_destroy(topo);
+ +        return; // SupportLevel::None.
+ +    }
+ +
+ +    hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
+ +
+ +    if (hwloc_topology_load(topo) != 0 || hwloc_get_root_obj(topo) == NULL)
+ +    {
+ +        hwloc_topology_destroy(topo);
+ +        return; // SupportLevel::None.
+ +    }
+ +
+ +    // If we get here, we can get a valid root object for the topology
+ +    *isThisSystem = hwloc_topology_is_thissystem(topo);
+ +
+ +    // Parse basic information about sockets, cores, and hardware threads
+ +    if (parseHwLocSocketsCoresThreads(topo, machine) == 0)
+ +    {
+ +        *supportLevel = HardwareTopology::SupportLevel::Basic;
+ +    }
+ +    else
+ +    {
+ +        hwloc_topology_destroy(topo);
+ +        return; // SupportLevel::None.
+ +    }
+ +
+ +    // Get information about cache and numa nodes
+ +    if (parseHwLocCache(topo, machine) == 0 && parseHwLocNuma(topo, machine) == 0)
+ +    {
+ +        *supportLevel = HardwareTopology::SupportLevel::Full;
+ +    }
+ +    else
+ +    {
+ +        hwloc_topology_destroy(topo);
+ +        return; // SupportLevel::Basic.
+ +    }
+ +
+ +    // PCI devices
+ +    if (parseHwLocDevices(topo, machine) == 0)
+ +    {
+ +        *supportLevel = HardwareTopology::SupportLevel::FullWithDevices;
+ +    }
+ +
+ +    hwloc_topology_destroy(topo);
+ +    return; // SupportLevel::Full or SupportLevel::FullWithDevices.
+ +}
+ +
+ +#endif
+ +
+ +/*! \brief Try to detect the number of logical processors.
+ + *
+ + *  \return The number of hardware processing units, or 0 if it fails.
+ + */
+ +int
+ +detectLogicalProcessorCount(FILE *fplog, const t_commrec *cr)
+ +{
+ +    int count = 0;
+ +
+ +    {
+ +#if GMX_NATIVE_WINDOWS
+ +        // Windows
+ +        SYSTEM_INFO sysinfo;
+ +        GetSystemInfo( &sysinfo );
+ +        count = sysinfo.dwNumberOfProcessors;
+ +#elif defined HAVE_SYSCONF
+ +        // We are probably on Unix. Check if we have the argument to use before executing any calls
+ +#    if defined(_SC_NPROCESSORS_CONF)
+ +        count = sysconf(_SC_NPROCESSORS_CONF);
+ +#        if defined(_SC_NPROCESSORS_ONLN)
+ +        /* On e.g. Arm, the Linux kernel can use advanced power saving features where
+ +         * processors are brought online/offline dynamically. This will cause
+ +         * _SC_NPROCESSORS_ONLN to report 1 at the beginning of the run. For this
+ +         * reason we now warn if this mismatches with the detected core count. */
+ +        int countOnline = sysconf(_SC_NPROCESSORS_ONLN);
+ +        if (count != countOnline)
+ +        {
++            /* We assume that this scenario means that the kernel has
++               disabled threads or cores, and that the only safe course is
++               to assume that _SC_NPROCESSORS_ONLN should be used. Even
++               this may not be valid if running in a containerized
++               environment, such system calls may read from
++               /sys/devices/system/cpu and report what the OS sees, rather
++               than what the container cgroup is supposed to set up as
++               limits. But we're not sure right now whether there's any
++               (standard-ish) way to handle that.
++
++               On ARM, the kernel may have powered down the cores,
++               which we'll warn the user about now. On x86, this
++               means HT is disabled by the kernel, not in the
++               BIOS. We're not sure what it means on other
++               architectures, or even if it is possible, because
++               sysconf is rather non-standardized. */
++            if (isArm)
++            {
++                md_print_warn(cr, fplog,
++                              "%d CPUs configured, but only %d of them are online.\n"
++                              "This can happen on embedded platforms (e.g. ARM) where the OS shuts some cores\n"
++                              "off to save power, and will turn them back on later when the load increases.\n"
++                              "However, this will likely mean GROMACS cannot pin threads to those cores. You\n"
++                              "will likely see much better performance by forcing all cores to be online, and\n"
++                              "making sure they run at their full clock frequency.", count, countOnline);
++            }
++            else
++            {
++                md_print_warn(cr, fplog,
++                              "Note: %d CPUs configured, but only %d of them are online, so GROMACS will use the latter.",
++                              count, countOnline);
++                // We use the online count to avoid (potential) oversubscription.
++                count = countOnline;
++            }
+ +        }
+ +#        endif
+ +#    elif defined(_SC_NPROC_CONF)
+ +        count = sysconf(_SC_NPROC_CONF);
+ +#    elif defined(_SC_NPROCESSORS_ONLN)
+ +        count = sysconf(_SC_NPROCESSORS_ONLN);
+ +#    elif defined(_SC_NPROC_ONLN)
+ +        count = sysconf(_SC_NPROC_ONLN);
+ +#    else
+ +#       warning "No valid sysconf argument value found. Executables will not be able to determine the number of logical cores: mdrun will use 1 thread by default!"
+ +#    endif      // End of check for sysconf argument values
+ +
+ +#else
+ +        count = 0; // Neither windows nor Unix.
+ +#endif
+ +    }
+ +#if GMX_OPENMP
+ +    int countFromOpenmp = gmx_omp_get_num_procs();
+ +    if (count != countFromOpenmp)
+ +    {
+ +        md_print_warn(cr, fplog,
+ +                      "Number of logical cores detected (%d) does not match the number reported by OpenMP (%d).\n"
+ +                      "Consider setting the launch configuration manually!",
+ +                      count, countFromOpenmp);
+ +    }
+ +#endif
+ +
+ +    GMX_UNUSED_VALUE(cr);
+ +    GMX_UNUSED_VALUE(fplog);
+ +    return count;
+ +}
+ +
+ +}   // namespace anonymous
+ +
+ +// static
+ +HardwareTopology HardwareTopology::detect(FILE *fplog, const t_commrec *cr)
+ +{
+ +    HardwareTopology result;
+ +
+ +    // Default values for machine and numa stuff
+ +    result.machine_.logicalProcessorCount   = 0;
+ +    result.machine_.numa.baseLatency        = 0.0;
+ +    result.machine_.numa.maxRelativeLatency = 0.0;
+ +    result.supportLevel_                    = SupportLevel::None;
+ +    result.isThisSystem_                    = true;
+ +
+ +#if GMX_HWLOC
+ +    parseHwLoc(&result.machine_, &result.supportLevel_, &result.isThisSystem_);
+ +#endif
+ +
+ +    // If something went wrong in hwloc (or if it was not present) we might
+ +    // have more information in cpuInfo
+ +    if (result.supportLevel_ < SupportLevel::Basic)
+ +    {
+ +        // There might be topology information in cpuInfo
+ +        parseCpuInfo(&result.machine_, &result.supportLevel_);
+ +    }
+ +    // If we did not manage to get anything from either hwloc or cpuInfo, find the cpu count at least
+ +    if (result.supportLevel_ == SupportLevel::None)
+ +    {
+ +        // No topology information; try to detect the number of logical processors at least
+ +        result.machine_.logicalProcessorCount = detectLogicalProcessorCount(fplog, cr);
+ +        if (result.machine_.logicalProcessorCount > 0)
+ +        {
+ +            result.supportLevel_ = SupportLevel::LogicalProcessorCount;
+ +        }
+ +    }
+ +    return result;
+ +}
+ +
+ +
+ +HardwareTopology::HardwareTopology()
+ +    : supportLevel_(SupportLevel::None)
+ +{
+ +}
+ +
+ +int HardwareTopology::numberOfCores() const
+ +{
+ +    if (supportLevel() >= SupportLevel::Basic)
+ +    {
+ +        // We assume all sockets have the same number of cores as socket 0.
+ +        // Since topology information is present, we can assume there is at least one socket.
+ +        return machine().sockets.size() * machine().sockets[0].cores.size();
+ +    }
+ +    else if (supportLevel() >= SupportLevel::LogicalProcessorCount)
+ +    {
+ +        return machine().logicalProcessorCount;
+ +    }
+ +    else
+ +    {
+ +        return 0;
+ +    }
+ +}
+ +
+ +} // namespace gmx
author	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 29 Jun 2016 13:30:40 +0000 (15:30 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 30 Jun 2016 20:00:07 +0000 (22:00 +0200)
		1	2
src/gromacs/gpu_utils/cudautils.cuh	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gpu_utils/gpu_utils.cu	patch \|	diff1 \|	\|	blob \| history
src/gromacs/hardware/hardwaretopology.cpp	patch \|	diff1 \|	\|	blob \| history