message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
else()
if(${GMX_SIMD} MATCHES "AVX" AND NOT (${FFTW}_HAVE_SSE OR ${FFTW}_HAVE_SSE2))
- # If we end up here we have an AVX Gromacs build, and FFTW
- # with SIMD. FFTW 3.3.5 will have the behaviour that
- # configuring with AVX support also adds SSE support, which is
- # what we want. There is no good way to detect the FFTW
- # version, however.
- message(WARNING "The FFTW library was compiled with neither --enable-sse nor --enable-sse2; those would have enabled SSE(2) SIMD instructions. This will give suboptimal performance. You should (re)compile the FFTW library with both SSE2 and AVX instruction support (use both --enable-sse2 and --enable-avx). More recent versions of FFTW compile support for such narrower SIMD by default.")
+ # If we end up here we have an AVX Gromacs build, and
+ # FFTW with SIMD.
+ message(WARNING "The FFTW library was compiled with neither --enable-sse nor --enable-sse2; those would have enabled SSE(2) SIMD instructions. This will give suboptimal performance. You should (re)compile the FFTW library with --enable-sse2 and --enable-avx (and --enable-avx2 or --enable-avx512 if supported).")
endif()
endif()
set(FFT_STATUS_MESSAGE "Using external FFT library - FFTW3")
# each release. It's hard to test because it is only used for
# REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
# been placed on the server.
-set(REGRESSIONTEST_MD5SUM "5f49cfc4f04a34f117340cf9b3e5f8a2" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "366438549270d005fa6def6e56ca0256" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
math(EXPR GMX_VERSION_NUMERIC
"${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
hardware, libraries, and compilers are only going to continue to get
more complex.
+Quick and dirty cluster installation
+------------------------------------
+
+On a cluster where users are expected to be running across multiple
+nodes using MPI, make one installation similar to the above, and
+another using an MPI wrapper compiler that is `building only
+mdrun`_, because that is the only component of |Gromacs| that uses
+MPI.
+
Typical installation
--------------------
As above, and with further details below, but you should consider
* ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
* ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
-* ``-DGMX_MPI=on`` to build using `MPI support`_
+* ``-DGMX_MPI=on`` to build using `MPI support`_ (generally good to combine with `building only mdrun`_)
* ``-DGMX_GPU=on`` to build using nvcc to run using NVIDIA `CUDA GPU acceleration`_ or an OpenCL_ GPU
* ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
* ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which |Gromacs| will run
* that you build FFTW from the source code.
If you build FFTW from source yourself, get the most recent version
-and follow the `FFTW installation guide`_. Note that we have recently
-contributed new SIMD optimization for several extra platforms to
-FFTW, which will appear in FFTW-3.3.5 (for now it is available in the
-FFTW repository on github, or you can find a very unofficial prerelease
-version at ftp://ftp.gromacs.org/pub/contrib ).
-Choose the precision for FFTW (i.e. single/float vs. double) to
-match whether you will later use mixed or double precision for
-|Gromacs|. There is no need to compile FFTW with
-threading or MPI support, but it does no harm. On x86 hardware,
-compile with *both* ``--enable-sse2`` and ``--enable-avx`` for
-FFTW-3.3.4 and earlier. As of FFTW-3.3.5 you should also add
-``--enable-avx2``. FFTW will create a fat library with codelets
-for all different instruction sets, and pick the fastest supported
-one at runtime. On IBM Power8, you definitely want the upcoming
-FFTW-3.3.5 and to compile it with ``--enable-vsx`` for SIMD support. If you are
-using a Cray, there is a special modified (commercial) version of
-FFTs using the FFTW interface which can be slightly faster.
+and follow the `FFTW installation guide`_. Choose the precision for
+FFTW (i.e. single/float vs. double) to match whether you will later
+use mixed or double precision for |Gromacs|. There is no need to
+compile FFTW with threading or MPI support, but it does no harm. On
+x86 hardware, compile with *both* ``--enable-sse2`` and
+``--enable-avx`` for FFTW-3.3.4 and earlier. From FFTW-3.3.5, you
+should also add ``--enable-avx2``. On Intel chipsets supporting
+512-wide AVX, including KNL, add ``--enable-avx512`` also. FFTW will
+create a fat library with codelets for all different instruction sets,
+and pick the fastest supported one at runtime. On IBM Power8, you
+definitely want FFTW-3.3.5 and to compile it with ``--enable-vsx`` for
+SIMD support. If you are using a Cray, there is a special modified
+(commercial) version of FFTs using the FFTW interface which can be
+slightly faster.
Using MKL
^^^^^^^^^
OpenCL GPU acceleration
^^^^^^^^^^^^^^^^^^^^^^^
-To build Gromacs with OpenCL support enabled, an OpenCL_ SDK
-(e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
-in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
-variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
-flags must be set
+
+The primary target of the |Gromacs| OpenCL support is accelerating simulations
+on AMD hardware, both discrete GPUs and APUs (integrated CPU+GPU chips).
+|Gromacs| OpenCL also works on NVIDIA GPUs, but performance
+and other limitations make it less practical (for details see the user guide).
+
+To build |Gromacs| with OpenCL_ support enabled, two components are
+required: the OpenCL_ headers and the wrapper library that acts
+as a client driver loader (so-called ICD loader).
+The additional, runtime-only dependency is the vendor-specific GPU driver
+for the device targeted. This also contains the OpenCL_ compiler.
+As the GPU compute kernels are compiled on-demand at run time,
+this vendor-specific compiler and driver is not needed for building |Gromacs|.
+The former, compile-time dependencies are standard components,
+hence stock versions can be obtained from most Linux distribution
+repositories (e.g. ``opencl-headers`` and ``ocl-icd-libopencl1`` on Debian/Ubuntu).
+Only the compatibility with the required OpenCL_ version |REQUIRED_OPENCL_MIN_VERSION|
+needs to be ensured.
+Alternatively, the headers and library can also be obtained from vendor SDKs
+(e.g. `from AMD <http://developer.amd.com/appsdk>`_),
+which must be installed in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``).
+
+To trigger an OpenCL_ build the following CMake flags must be set
::
cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
-Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
-known limitations in the user guide. If you want to
-do so anyway, because NVIDIA OpenCL support is part of the CUDA
-package, a C++ compiler supported by your CUDA installation is
-required.
-
On Mac OS, an AMD GPU can be used only with OS version 10.10.4 and
higher; earlier OS versions are known to run incorrectly.
Building only mdrun
^^^^^^^^^^^^^^^^^^^
-Past versions of the build system offered "mdrun" and "install-mdrun"
-targets (similarly for other programs too) to build and install only
-the mdrun program, respectively. Such a build is useful when the
-configuration is only relevant for mdrun (such as with
-parallelization options for MPI, SIMD, GPUs, or on BlueGene or Cray),
-or the length of time for the compile-link-install cycle is relevant
-when developing.
This is now supported with the ``cmake`` option
-``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a cut-down version of
-``libgromacs`` and/or the mdrun program.
+``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a different version of
+``libgromacs`` and the ``mdrun`` program.
Naturally, now ``make install`` installs only those
products. By default, mdrun-only builds will default to static linking
against |Gromacs| libraries, because this is generally a good idea for
-the targets for which an mdrun-only build is desirable. If you re-use
-a build tree and change to the mdrun-only build, then you will inherit
-the setting for ``BUILD_SHARED_LIBS`` from the old build, and will be
-warned that you may wish to manage ``BUILD_SHARED_LIBS`` yourself.
+the targets for which an mdrun-only build is desirable.
Installing |Gromacs|
--------------------
run an MPI program is called ``srun``.
The ``make check`` target also runs integration-style tests that may run
-with MPI if ``GMX_MPI=ON`` was set. To make these work, you may need to
-set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``, ``NUMPROC``,
+with MPI if ``GMX_MPI=ON`` was set. To make these work with various possible
+MPI libraries, you may need to
+set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``,
``MPIEXEC_PREFLAGS`` and ``MPIEXEC_POSTFLAGS`` so that
``mdrun-mpi-test_mpi`` would run on multiple ranks via the shell command
${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${NUMPROC} ${MPIEXEC_PREFLAGS} \
mdrun-mpi-test_mpi ${MPIEXEC_POSTFLAGS} -otherflags
-Typically, one might use variable values ``mpirun``, ``-np``, ``2``, ``''``,
-``''`` respectively, in order to run on two ranks.
+A typical example for SLURM is
+
+::
+
+ cmake .. -DGMX_MPI=on -DMPIEXEC=srun -DMPIEXEC_NUMPROC_FLAG=-n -DMPIEXEC_PREFLAGS= -DMPIEXEC_POSTFLAGS=
Testing |Gromacs| for performance
When different ranks have a different computational load
(load imbalance), all ranks will have to wait for the one
that takes the most time. One would like to avoid such a situation.
-Load imbalance can occur due to three reasons:
+Load imbalance can occur due to four reasons:
\begin{itemize}
\item inhomogeneous particle distribution
\item inhomogeneous interaction cost distribution (charged/uncharged,
water/non-water due to {\gromacs} water innerloops)
\item statistical fluctuation (only with small particle numbers)
+\item differences in communication time, due to network topology and/or other jobs on the machine interfering with our communication
\end{itemize}
So we need a dynamic load balancing algorithm
where the volume of each domain decomposition cell
By default, {\tt mdrun} automatically turns on the dynamic load
balancing during a simulation when the total performance loss
-due to the force calculation imbalance is 5\% or more.
+due to the force calculation imbalance is 2\% or more.
{\bf Note} that the reported force load imbalance numbers might be higher,
since the force calculation is only part of work that needs to be done
during an integration step.
The minimum allowed scaling can be changed with the {\tt -dds}
option of {\tt mdrun}.
+The load imbalance is measured by timing a single region of the MD step
+on each MPI rank. This region can not include MPI communication, as
+timing of MPI calls does not allow separating wait due to imbalance
+from actual communication.
+The domain volumes are then scaled, with under-relaxation, inversely
+proportional with the measured time. This procedure will decrease the
+load imbalance when the change in load in the measured region correlates
+with the change in domain volume and the load outside
+the measured region does not depend strongly on the domain volume.
+In CPU-only simulations, the load is measured between the coordinate
+and the force communication. In hybrid CPU-GPU simulations we overlap
+communication on the CPU with calculation on the GPU. Therefore we
+measure from the last communication before the force calculation to
+when the CPU or GPU is finished, whichever is last.
+When not using PME ranks, we subtract the time in PME from the CPU time,
+as this includes MPI calls and the PME load is independent of domain size.
+This generally works well, unless the non-bonded load is low and there is
+imbalance in the bonded interactions. Then two issues can arise.
+Dynamic load balancing can increase the imbalance in update and constraints
+and with PME the coordinate and force redistribution time can go up
+significantly. Although dynamic load balancing
+can significantly improve performance in cases where there is imbalance in
+the bonded interactions on the CPU, there are many situations in which
+some domains continue decreasing in size and the load imbalance increases
+and/or PME coordinate and force redistribution cost increases significantly.
+As of version 2016.1, {\tt mdrun} disables the dynamic load balancing when
+measurement indicates that it deteriorates performance. This means that in most
+cases the user will get good performance with the default, automated
+dynamic load balancing setting.
+
\subsection{Constraints in parallel\index{constraints}}
\label{subsec:plincs}
Since with domain decomposition parts of molecules can reside
disable the use of the lower-latency cudaLaunchKernel API even when supported (CUDA >=v7.0).
Should only be used for benchmarking purposes.
+``GMX_DISABLE_CUDA_TIMING``
+ Disables GPU timing of CUDA tasks; synonymous with ``GMX_DISABLE_GPU_TIMING``.
+
``GMX_CYCLE_ALL``
times all code during runs. Incompatible with threads.
disables architecture-specific SIMD-optimized (SSE2, SSE4.1, AVX, etc.)
non-bonded kernels thus forcing the use of plain C kernels.
-``GMX_DISABLE_CUDA_TIMING``
+``GMX_DISABLE_GPU_TIMING``
timing of asynchronously executed GPU operations can have a
non-negligible overhead with short step times. Disabling timing can improve performance in these cases.
only the flavor required for the simulation is generated and
compiled.
-``GMX_OCL_FASTMATH``
- Adds the option ``cl-fast-relaxed-math`` to the compiler
- options (in the CUDA version this is enabled by default, it is likely that
- the same will happen with the OpenCL version soon)
+``GMX_OCL_DISABLE_FASTMATH``
+ Prevents the use of ``-cl-fast-relaxed-math`` compiler option.
``GMX_OCL_DUMP_LOG``
If defined, the OpenCL build log is always written to the
.. _NVIDIA blog article: https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/
+Reducing overheads in GPU accelerated runs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order for CPU cores and GPU(s) to execute concurrently, tasks are
+launched and executed asynchronously on the GPU(s) while the CPU cores
+execute non-offloaded force computation (like long-range PME electrostatics).
+Asynchronous task launches are handled by the GPU device driver and
+require CPU involvement. Therefore, the work of scheduling
+GPU tasks will incur an overhead that can in some cases significantly
+delay or interfere with the CPU execution.
+
+Delays in CPU execution are caused by the latency of launching GPU tasks,
+an overhead that can become significant as simulation ns/day increases
+(i.e. with shorter wall-time per step).
+The overhead is measured by :ref:`gmx mdrun` and reported in the performance
+summary section of the log file ("Launch GPU ops" row).
+A few percent of runtime spent in this category is normal,
+but in fast-iterating and multi-GPU parallel runs 10% or larger overheads can be observed.
+In general, a user can do little to avoid such overheads, but there
+are a few cases where tweaks can give performance benefits.
+In single-rank runs timing of GPU tasks is by default enabled and,
+while in most cases its impact is small, in fast runs performance can be affected.
+The performance impact will be most significant on NVIDIA GPUs with CUDA,
+less on AMD with OpenCL.
+In these cases, when more than a few percent of "Launch GPU ops" time is observed,
+it is recommended turning off timing by setting the ``GMX_DISABLE_GPU_TIMING``
+environment variable.
+In parallel runs with many ranks sharing a GPU,
+launch overheads can also be reduced by starting fewer thread-MPI
+or MPI ranks per GPU; e.g. most often one rank per thread or core is not optimal.
+
+The second type of overhead, interference of the GPU driver with CPU computation,
+is caused by the scheduling and coordination of GPU tasks.
+A separate GPU driver thread can require CPU resources
+which may clash with the concurrently running non-offloaded tasks,
+potentially degrading the performance of PME or bonded force computation.
+This effect is most pronounced when using AMD GPUs with OpenCL with
+all stable driver releases to date (up to and including fglrx 12.15).
+To minimize the overhead it is recommended to
+leave a CPU hardware thread unused when launching :ref:`gmx mdrun`,
+especially on CPUs with high core count and/or HyperThreading enabled.
+E.g. on a machine with a 4-core CPU and eight threads (via HyperThreading) and an AMD GPU,
+try ``gmx mdrun -ntomp 7 -pin on``.
+This will leave free CPU resources for the GPU task scheduling
+reducing interference with CPU computation.
+Note that assigning fewer resources to :ref:`gmx mdrun` CPU computation
+involves a tradeoff which may outweigh the benefits of reduced GPU driver overhead,
+in particular without HyperThreading and with few CPU cores.
+
TODO In future patch: any tips not covered above
Running the OpenCL version of mdrun
minimum OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|. See
also the :ref:`known limitations <opencl-known-limitations>`.
+Devices from the AMD GCN architectures (all series) and NVIDIA Fermi
+and later (compute capability 2.0) are known to work, but before
+doing production runs always make sure that the |Gromacs| tests
+pass successfully on the hardware.
+
+The OpenCL GPU kernels are compiled at run time. Hence,
+building the OpenCL program can take a few seconds introducing a slight
+delay in the :ref:`gmx mdrun` startup. This is not normally a
+problem for long production MD, but you might prefer to do some kinds
+of work, e.g. that runs very few steps, on just the CPU (e.g. see ``-nb`` above).
+
The same ``-gpu_id`` option (or ``GMX_GPU_ID`` environment variable)
used to select CUDA devices, or to define a mapping of GPUs to PP
ranks, is used for OpenCL devices.
-The following devices are known to work correctly:
- - AMD: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 240,
- Radeon R7 M260, Radeon R9 290
- - NVIDIA: GeForce GTX 660M, GeForce GTX 660Ti, GeForce GTX 750Ti,
- GeForce GTX 780, GTX Titan
-
-Building the OpenCL program can take a few seconds when :ref:`gmx
-mdrun` starts up, because the kernels that run on the
-GPU can only be compiled at run time. This is not normally a
-problem for long production MD, but you might prefer to do some kinds
-of work on just the CPU (e.g. see ``-nb`` above).
-
Some other :ref:`OpenCL management <opencl-management>` environment
variables may be of interest to developers.
almost no performance gain when using NVIDIA GPUs.
The issue affects NVIDIA driver versions up to 349 series, but it
is known to be fixed in the 352 and later driver releases.
+- On NVIDIA GPUs the OpenCL kernels achieve much lower performance
+ than the equivalent CUDA kernels due to limitations of the NVIDIA OpenCL
+ compiler.
- The AMD APPSDK version 3.0 ships with OpenCL compiler/runtime components,
libamdocl12cl64.so and libamdocl64.so (only in earlier releases),
that conflict with newer fglrx GPU drivers which provide the same libraries.
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
set(_fftw_simd_support_level "--enable-sse2")
elseif(${GMX_SIMD} MATCHES "^(AVX)")
# Testing shows FFTW configured with --enable-sse2 --enable-avx is
- # slightly faster on most architectures than --enable-sse2 alone
- set(_fftw_simd_support_level --enable-sse2;--enable-avx)
+ # slightly faster on most architectures than --enable-sse2 alone.
+ # Support for --enable-avx2 was only added in 3.3.5, but
+ # configuring with it is at worst a warning, even on an earlier
+ # version.
+ set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2;--enable-avx512)
+elseif(${GMX_SIMD} MATCHES "^(VSX)")
+ set(_fftw_simd_support_level --enable-vsx)
endif()
set(GMX_BUILD_OWN_FFTW_OPTIMIZATION_CONFIGURATION ${_fftw_simd_support_level} CACHE INTERNAL "Optimization flags for FFTW compilation")
endif()
# Machinery for running the external project
-set(EXTERNAL_FFTW_VERSION 3.3.4)
+set(EXTERNAL_FFTW_VERSION 3.3.5)
# cmake make eats slashes //// -> //
set(GMX_BUILD_OWN_FFTW_URL
"http:////www.fftw.org/fftw-${EXTERNAL_FFTW_VERSION}.tar.gz" CACHE PATH
"URL from where to download fftw (use an absolute path when offline, adjust GMX_BUILD_OWN_FFTW_MD5 if downloading other version than ${EXTERNAL_FFTW_VERSION})")
-set(GMX_BUILD_OWN_FFTW_MD5 2edab8c06b24feeb3b82bbb3ebf3e7b3 CACHE STRING
+set(GMX_BUILD_OWN_FFTW_MD5 6cc08a3b9c7ee06fdd5b9eb02e06f569 CACHE STRING
"Expected MD5 hash for the file at GMX_BUILD_OWN_FFTW_URL")
mark_as_advanced(GMX_BUILD_OWN_FFTW_URL GMX_BUILD_OWN_FFTW_MD5)
#define GMX_DD_NNODES_SENDRECV 4
+/* We check if to turn on DLB at the first and every 100 DD partitionings.
+ * With large imbalance DLB will turn on at the first step, so we can
+ * make the interval so large that the MPI overhead of the check is negligible.
+ */
+static const int c_checkTurnDlbOnInterval = 100;
+/* We need to check if DLB results in worse performance and then turn it off.
+ * We check this more often than for turning DLB on, because the DLB can scale
+ * the domains very rapidly, so if unlucky the load imbalance can go up quickly
+ * and furthermore, we are already synchronizing often with DLB, so
+ * the overhead of the MPI Bcast is not that high.
+ */
+static const int c_checkTurnDlbOffInterval = 20;
+
+/* Forward declaration */
+static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
+
+
/*
#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
static bool dlbIsOn(const gmx_domdec_comm_t *comm)
{
- return (comm->dlbState == edlbsOn);
+ return (comm->dlbState == edlbsOnCanTurnOff ||
+ comm->dlbState == edlbsOnForever);
}
static void vec_rvec_init(vec_rvec_t *v)
/* This error should never be triggered under normal
* circumstances, but you never know ...
*/
- gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
+ gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
gmx_step_str(step, buf),
dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
}
if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
{
char buf[22];
- gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
+ gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
gmx_step_str(step, buf),
dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
ncd, comm->cellsize_min[dim]);
comm->cellsize_min[dim])
{
char buf[22];
- gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
+ gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
gmx_step_str(step, buf), dim2char(dim),
comm->cell_x1[dim] - comm->cell_x0[dim],
ddbox->skew_fac[dim],
{
case 'a': dlbState = edlbsOffCanTurnOn; break;
case 'n': dlbState = edlbsOffForever; break;
- case 'y': dlbState = edlbsOn; break;
+ case 'y': dlbState = edlbsOnForever; break;
default: gmx_incons("Unknown dlb_opt");
}
if (!EI_DYNAMICS(ir->eI))
{
- if (dlbState == edlbsOn)
+ if (dlbState == edlbsOnForever)
{
sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
dd_warning(cr, fplog, buf);
case edlbsOffForever:
break;
case edlbsOffCanTurnOn:
+ case edlbsOnCanTurnOff:
dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
dlbState = edlbsOffForever;
break;
- case edlbsOn:
+ case edlbsOnForever:
dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
break;
default:
/* Initialize to GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
- comm->bCheckWhetherToTurnDlbOn = TRUE;
+ comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
+ /* To consider turning DLB on after 2*nstlist steps we need to check
+ * at partitioning count 3. Thus we need to increase the first count by 2.
+ */
+ comm->ddPartioningCountFirstDlbOff += 2;
if (fplog)
{
gmx_domdec_comm_t *comm;
real cellsize_min;
int d, nc, i;
- char buf[STRLEN];
dd = cr->dd;
comm = dd->comm;
- if (fplog)
- {
- fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
- }
-
cellsize_min = comm->cellsize_min[dd->dim[0]];
for (d = 1; d < dd->ndim; d++)
{
if (cellsize_min < comm->cellsize_limit*1.05)
{
- dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+ char buf[STRLEN];
+ sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
/* Change DLB from "auto" to "no". */
comm->dlbState = edlbsOffForever;
return;
}
- dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
- comm->dlbState = edlbsOn;
+ char buf[STRLEN];
+ sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
+ dd_warning(cr, fplog, buf);
+ comm->dlbState = edlbsOnCanTurnOff;
+
+ /* Store the non-DLB performance, so we can check if DLB actually
+ * improves performance.
+ */
+ GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
+ comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
set_dlb_limits(dd);
}
}
+/* Turns off dynamic load balancing after it was found to degrade
+ * performance, while leaving it eligible to be turned on again later
+ * (state edlbsOffCanTurnOn). Records that DLB was turned off once and
+ * stores the current partitioning count, so that the cycle counts
+ * measured right after the switch can be ignored.
+ */
+static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+{
+    gmx_domdec_t *dd = cr->dd;
+
+    char buf[STRLEN];
+    sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
+    dd_warning(cr, fplog, buf);
+    /* Off, but can be turned on again if measurements warrant it */
+    dd->comm->dlbState = edlbsOffCanTurnOn;
+    dd->comm->haveTurnedOffDlb = true;
+    /* Remember when we turned DLB off, so the first cycle counts
+     * after the switch can be excluded from the averages.
+     */
+    dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+}
+
+/* Permanently disables dynamic load balancing for the remainder of the
+ * run, because measurements without DLB showed it degraded performance.
+ * Only valid when DLB is currently off but eligible to turn on
+ * (asserted below).
+ */
+static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+{
+    GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
+    char buf[STRLEN];
+    sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
+    dd_warning(cr, fplog, buf);
+    cr->dd->comm->dlbState = edlbsOffForever;
+}
+
static char *init_bLocalCG(const gmx_mtop_t *mtop)
{
int ncg, cg;
if (dd->comm->dlbState == edlbsOffCanTurnOn)
{
dd->comm->bCheckWhetherToTurnDlbOn = bValue;
+
+ if (bValue == TRUE)
+ {
+ /* Store the DD partitioning count, so we can ignore cycle counts
+ * over the next nstlist steps, which are often slower.
+ */
+ dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+ }
}
}
*/
static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
{
- const int nddp_chk_dlb = 100;
-
if (dd->comm->dlbState != edlbsOffCanTurnOn)
{
return FALSE;
}
+ if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
+ {
+ /* We ignore the first nstlist steps at the start of the run
+ * or after PME load balancing or after turning DLB off, since
+ * these often have extra allocation or cache miss overhead.
+ */
+ return FALSE;
+ }
+
/* We should check whether we should use DLB directly after
* unlocking DLB. */
if (dd->comm->bCheckWhetherToTurnDlbOn)
dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
return TRUE;
}
- /* We should also check whether we should use DLB every 100
+ /* We check whether we should use DLB every c_checkTurnDlbOnInterval
     * partitionings (we do not do this every partitioning, so that we
* avoid excessive communication). */
- if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
+ if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
{
return TRUE;
}
gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
{
- return (dd->comm->dlbState == edlbsOn);
+ return dlbIsOn(dd->comm);
}
gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
gmx_int64_t step_pcoupl;
rvec cell_ns_x0, cell_ns_x1;
int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
- gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
+ gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
gmx_bool bRedist, bSortCG, bResortAll;
ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
real grid_density;
}
comm->n_load_collect++;
- if (bCheckWhetherToTurnDlbOn)
+ if (dlbIsOn(comm))
{
+ if (DDMASTER(dd))
+ {
+ /* Add the measured cycles to the running average */
+ const float averageFactor = 0.1f;
+ comm->cyclesPerStepDlbExpAverage =
+ (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
+ averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
+ }
+ if (comm->dlbState == edlbsOnCanTurnOff &&
+ dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
+ {
+ gmx_bool turnOffDlb;
+ if (DDMASTER(dd))
+ {
+ /* If the running averaged cycles with DLB are more
+ * than before we turned on DLB, turn off DLB.
+ * We will again run and check the cycles without DLB
+                     * and we can then decide whether to turn off DLB forever.
+ */
+ turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
+ comm->cyclesPerStepBeforeDLB);
+ }
+ dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
+ if (turnOffDlb)
+ {
+ /* To turn off DLB, we need to redistribute the atoms */
+ dd_collect_state(dd, state_local, state_global);
+ bMasterState = TRUE;
+ turn_off_dlb(fplog, cr, step);
+ }
+ }
+ }
+ else if (bCheckWhetherToTurnDlbOn)
+ {
+ gmx_bool turnOffDlbForever = FALSE;
+ gmx_bool turnOnDlb = FALSE;
+
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
{
- /* Here we check if the max PME rank load is more than 0.98
- * the max PP force load. If so, PP DLB will not help,
- * since we are (almost) limited by PME. Furthermore,
- * DLB will cause a significant extra x/f redistribution
- * cost on the PME ranks, which will then surely result
- * in lower total performance.
- * This check might be fragile, since one measurement
- * below 0.98 (although only done once every 100 DD part.)
- * could turn on DLB for the rest of the run.
+ /* If we recently turned off DLB, we want to check if
+ * performance is better without DLB. We want to do this
+ * ASAP to minimize the chance that external factors
+                 * that slowed down the DLB step are gone here and we
+ * incorrectly conclude that DLB was causing the slowdown.
+ * So we measure one nstlist block, no running average.
*/
- if (cr->npmenodes > 0 &&
- dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ if (comm->haveTurnedOffDlb &&
+ comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
+ comm->cyclesPerStepDlbExpAverage)
{
- bTurnOnDLB = FALSE;
+ /* After turning off DLB we ran nstlist steps in fewer
+ * cycles than with DLB. This likely means that DLB
+                     * is not beneficial, but this could be due to a one
+ * time unlucky fluctuation, so we require two such
+ * observations in close succession to turn off DLB
+ * forever.
+ */
+ if (comm->dlbSlowerPartitioningCount > 0 &&
+ dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
+ {
+ turnOffDlbForever = TRUE;
+ }
+ comm->haveTurnedOffDlb = false;
+ /* Register when we last measured DLB slowdown */
+ comm->dlbSlowerPartitioningCount = dd->ddp_count;
}
else
{
- bTurnOnDLB =
- (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
- }
- if (debug)
- {
- fprintf(debug, "step %s, imb loss %f\n",
- gmx_step_str(step, sbuf),
- dd_force_imb_perf_loss(dd));
+ /* Here we check if the max PME rank load is more than 0.98
+ * the max PP force load. If so, PP DLB will not help,
+ * since we are (almost) limited by PME. Furthermore,
+ * DLB will cause a significant extra x/f redistribution
+ * cost on the PME ranks, which will then surely result
+ * in lower total performance.
+ */
+ if (cr->npmenodes > 0 &&
+ dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ {
+ turnOnDlb = FALSE;
+ }
+ else
+ {
+ turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ }
}
}
- dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
- if (bTurnOnDLB)
+ struct
+ {
+ gmx_bool turnOffDlbForever;
+ gmx_bool turnOnDlb;
+ }
+ bools {
+ turnOffDlbForever, turnOnDlb
+ };
+ dd_bcast(dd, sizeof(bools), &bools);
+ if (bools.turnOffDlbForever)
+ {
+ turn_off_dlb_forever(fplog, cr, step);
+ }
+ else if (bools.turnOnDlb)
{
turn_on_dlb(fplog, cr, step);
bDoDLB = TRUE;
edlbsOffForever, /**< DLB is off and will never be turned on */
edlbsOffCanTurnOn, /**< DLB is off and will turn on on imbalance */
edlbsOffTemporarilyLocked, /**< DLB is off and temporarily can't turn on */
- edlbsOn, /**< DLB is on and will stay on forever */
+ edlbsOnCanTurnOff, /**< DLB is on and can turn off when slow */
+ edlbsOnForever, /**< DLB is on and will stay on forever, because the user chose this */
edlbsNR /**< The number of DLB states */
};
-/* Allowed DLB state transitions:
- * edlbsOffCanTurnOn -> edlbsOn
+/* Allowed DLB state transitions in automatic mode:
+ * edlbsOffCanTurnOn -> edlbsOnCanTurnOff
* edlbsOffCanTurnOn -> edlbsOffForever
* edlbsOffCanTurnOn -> edlbsOffTemporarilyLocked
* edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ * edlbsOnCanTurnOff -> edlbsOffCanTurnOn
*/
/*! \brief The PME domain decomposition for one dimension */
int dlbState;
/* With dlbState=edlbsOffCanTurnOn, should we check if to DLB on at the next DD? */
gmx_bool bCheckWhetherToTurnDlbOn;
+ /* The first DD count since we are running without DLB */
+ int ddPartioningCountFirstDlbOff;
/* Cell sizes for static load balancing, first index cartesian */
real **slb_frac;
/** Which cg distribution is stored on the master node,
* stored as DD partitioning call count.
*/
- int master_cg_ddp_count;
+ gmx_int64_t master_cg_ddp_count;
/** The number of cg's received from the direct neighbors */
int zone_ncg1[DD_MAXZONE];
/** How many times have we collected the load measurements */
int n_load_collect;
+ /* Cycle count history for DLB checks */
+ float cyclesPerStepBeforeDLB; /**< The averaged cycles per step over the last nstlist step before turning on DLB */
+ float cyclesPerStepDlbExpAverage; /**< The running average of the cycles per step during DLB */
+ bool haveTurnedOffDlb; /**< Have we turned off DLB (after turning DLB on)? */
+ gmx_int64_t dlbSlowerPartitioningCount; /**< The DD step at which we last measured that DLB off was faster than DLB on, 0 if there was no such step */
+
/* Statistics */
double sum_nat[ddnatNR-ddnatZONE]; /**< The atoms per zone, summed over the steps */
int ndecomp; /**< The number of partioning calls */
nSA_time = str_nelem(is->anneal_time, MAXPTR, ptr1);
if (nSA_time != k)
{
- gmx_fatal(FARGS, "Found %d annealing-time values, wanter %d\n", nSA_time, k);
+ gmx_fatal(FARGS, "Found %d annealing-time values, wanted %d\n", nSA_time, k);
}
nSA_temp = str_nelem(is->anneal_temp, MAXPTR, ptr2);
if (nSA_temp != k)
compilerOptions += " -cl-opt-disable";
}
- if (getenv("GMX_OCL_FASTMATH") )
+    /* Fastmath improves performance on all supported arch */
+ if (getenv("GMX_OCL_DISABLE_FASTMATH") == NULL)
{
compilerOptions += " -cl-fast-relaxed-math";
}
if (bWarn)
{
- if (maxwarn >= 0)
+ if (maxwarn < INT_MAX)
{
cconerr(lincsd, xprime, pbc,
&ncons_loc, &p_ssd, &p_max, &p_imax);
invdt, v, vir != NULL, vir_r_m_dr,
econq, nrnb,
constr->maxwarn, &constr->warncount_lincs);
- if (!bOK && constr->maxwarn >= 0)
+ if (!bOK && constr->maxwarn < INT_MAX)
{
if (fplog != NULL)
{
idef, ir, x, xprime, nrnb,
constr->lagr, lambda, dvdlambda,
invdt, v, vir != NULL, vir_r_m_dr,
- constr->maxwarn >= 0, econq);
+ constr->maxwarn < INT_MAX, econq);
break;
case (econqVeloc):
bOK = bshakef(fplog, constr->shaked,
idef, ir, x, min_proj, nrnb,
constr->lagr, lambda, dvdlambda,
invdt, NULL, vir != NULL, vir_r_m_dr,
- constr->maxwarn >= 0, econq);
+ constr->maxwarn < INT_MAX, econq);
break;
default:
gmx_fatal(FARGS, "Internal error, SHAKE called for constraining something else than coordinates");
break;
}
- if (!bOK && constr->maxwarn >= 0)
+ if (!bOK && constr->maxwarn < INT_MAX)
{
if (fplog != NULL)
{
{
constr->maxwarn = 0;
sscanf(env, "%8d", &constr->maxwarn);
+ if (constr->maxwarn < 0)
+ {
+ constr->maxwarn = INT_MAX;
+ }
if (fplog)
{
fprintf(fplog,
constr->maxwarn);
}
}
- if (constr->maxwarn < 0 && fplog)
- {
- fprintf(fplog, "maxwarn < 0, will not stop on constraint errors\n");
- }
constr->warncount_lincs = 0;
constr->warncount_settle = 0;
/* CUDA timing disabled as event timers don't work:
- with multiple streams = domain-decomposition;
- - when turned off by GMX_DISABLE_CUDA_TIMING.
+ - when turned off by GMX_DISABLE_CUDA_TIMING/GMX_DISABLE_GPU_TIMING.
*/
nb->bDoTime = (!nb->bUseTwoStreams &&
- (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
+ (getenv("GMX_DISABLE_CUDA_TIMING") == NULL) &&
+ (getenv("GMX_DISABLE_GPU_TIMING") == NULL));
if (nb->bDoTime)
{
init_plist(nb->plist[eintLocal]);
/* OpenCL timing disabled if GMX_DISABLE_OCL_TIMING is defined. */
- nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
+ /* TODO deprecate the first env var in the 2017 release. */
+ nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL &&
+ getenv("GMX_DISABLE_GPU_TIMING") == NULL);
/* Create queues only after bDoTime has been initialized */
if (nb->bDoTime)
#include "nbnxn_gpu.h"
-static const bool useCuda = GMX_GPU == GMX_GPU_CUDA;
-static const bool useOpenCL = GMX_GPU == GMX_GPU_OPENCL;
-
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
gmx_int64_t step,
if (bDoForces && DOMAINDECOMP(cr))
{
- if (bUseGPU && useCuda)
+ if (bUseGPU)
{
/* We are done with the CPU compute, but the GPU local non-bonded
* kernel can still be running while we communicate the forces.
if (bUseOrEmulGPU)
{
/* wait for local forces (or calculate in emulation mode) */
- if (bUseGPU && useCuda)
+ if (bUseGPU)
{
float cycles_tmp, cycles_wait_est;
- const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
+ * but even with a step of 0.1 ms the difference is less than 1%
+ * of the step time.
+ */
+ const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
{
cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
- if (cycles_tmp < cuda_api_overhead_margin)
+ if (cycles_tmp < gpuWaitApiOverheadMargin)
{
/* We measured few cycles, it could be that the kernel
* and transfer finished earlier and there was no actual
*/
cycles_force += cycles_wait_est;
cycles_wait_gpu += cycles_wait_est;
- }
- else if (bUseGPU && useOpenCL)
- {
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
- flags, eatLocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- }
- if (bUseGPU)
- {
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);