message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
else()
if(${GMX_SIMD} MATCHES "AVX" AND NOT (${FFTW}_HAVE_SSE OR ${FFTW}_HAVE_SSE2))
- # If we end up here we have an AVX Gromacs build, and FFTW
- # with SIMD. FFTW 3.3.5 will have the behaviour that
- # configuring with AVX support also adds SSE support, which is
- # what we want. There is no good way to detect the FFTW
- # version, however.
- message(WARNING "The FFTW library was compiled with neither --enable-sse nor --enable-sse2; those would have enabled SSE(2) SIMD instructions. This will give suboptimal performance. You should (re)compile the FFTW library with both SSE2 and AVX instruction support (use both --enable-sse2 and --enable-avx). More recent versions of FFTW compile support for such narrower SIMD by default.")
+ # If we end up here we have an AVX Gromacs build, and
+ # FFTW with SIMD.
+ message(WARNING "The FFTW library was compiled with neither --enable-sse nor --enable-sse2; those would have enabled SSE(2) SIMD instructions. This will give suboptimal performance. You should (re)compile the FFTW library with --enable-sse2 and --enable-avx (and --enable-avx2 or --enable-avx512 if supported).")
endif()
endif()
set(FFT_STATUS_MESSAGE "Using external FFT library - FFTW3")
# each release. It's hard to test because it is only used for
# REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
# been placed on the server.
-set(REGRESSIONTEST_MD5SUM "5f49cfc4f04a34f117340cf9b3e5f8a2" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "366438549270d005fa6def6e56ca0256" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
math(EXPR GMX_VERSION_NUMERIC
"${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
hardware, libraries, and compilers are only going to continue to get
more complex.
+Quick and dirty cluster installation
+------------------------------------
+
+On a cluster where users are expected to be running across multiple
+nodes using MPI, make one installation similar to the above, and
+another using an MPI wrapper compiler that is `building only
+mdrun`_, because that is the only component of |Gromacs| that uses
+MPI.
+
Typical installation
--------------------
As above, and with further details below, but you should consider
* ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
* ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
-* ``-DGMX_MPI=on`` to build using `MPI support`_
+* ``-DGMX_MPI=on`` to build using `MPI support`_ (generally good to combine with `building only mdrun`_)
* ``-DGMX_GPU=on`` to build using nvcc to run using NVIDIA `CUDA GPU acceleration`_ or an OpenCL_ GPU
* ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
* ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which |Gromacs| will run
* that you build FFTW from the source code.
If you build FFTW from source yourself, get the most recent version
-and follow the `FFTW installation guide`_. Note that we have recently
-contributed new SIMD optimization for several extra platforms to
-FFTW, which will appear in FFTW-3.3.5 (for now it is available in the
-FFTW repository on github, or you can find a very unofficial prerelease
-version at ftp://ftp.gromacs.org/pub/contrib ).
-Choose the precision for FFTW (i.e. single/float vs. double) to
-match whether you will later use mixed or double precision for
-|Gromacs|. There is no need to compile FFTW with
-threading or MPI support, but it does no harm. On x86 hardware,
-compile with *both* ``--enable-sse2`` and ``--enable-avx`` for
-FFTW-3.3.4 and earlier. As of FFTW-3.3.5 you should also add
-``--enable-avx2``. FFTW will create a fat library with codelets
-for all different instruction sets, and pick the fastest supported
-one at runtime. On IBM Power8, you definitely want the upcoming
-FFTW-3.3.5 and to compile it with ``--enable-vsx`` for SIMD support. If you are
-using a Cray, there is a special modified (commercial) version of
-FFTs using the FFTW interface which can be slightly faster.
+and follow the `FFTW installation guide`_. Choose the precision for
+FFTW (i.e. single/float vs. double) to match whether you will later
+use mixed or double precision for |Gromacs|. There is no need to
+compile FFTW with threading or MPI support, but it does no harm. On
+x86 hardware, compile with *both* ``--enable-sse2`` and
+``--enable-avx`` for FFTW-3.3.4 and earlier. From FFTW-3.3.5, you
+should also add ``--enable-avx2``. On Intel chipsets supporting
+512-wide AVX, including KNL, add ``--enable-avx512`` also. FFTW will
+create a fat library with codelets for all different instruction sets,
+and pick the fastest supported one at runtime. On IBM Power8, you
+definitely want FFTW-3.3.5 and to compile it with ``--enable-vsx`` for
+SIMD support. If you are using a Cray, there is a special modified
+(commercial) version of FFTs using the FFTW interface which can be
+slightly faster.
Using MKL
^^^^^^^^^
OpenCL GPU acceleration
^^^^^^^^^^^^^^^^^^^^^^^
-To build Gromacs with OpenCL support enabled, an OpenCL_ SDK
-(e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
-in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
-variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
-flags must be set
+
+The primary target of the |Gromacs| OpenCL support is accelerating simulations
+on AMD hardware, both discrete GPUs and APUs (integrated CPU+GPU chips).
+|Gromacs| OpenCL also works on NVIDIA GPUs, but performance
+and other limitations make it less practical (for details see the user guide).
+
+To build |Gromacs| with OpenCL_ support enabled, two components are
+required: the OpenCL_ headers and the wrapper library that acts
+as a client driver loader (so-called ICD loader).
+The additional, runtime-only dependency is the vendor-specific GPU driver
+for the device targeted. This also contains the OpenCL_ compiler.
+As the GPU compute kernels are compiled on-demand at run time,
+this vendor-specific compiler and driver is not needed for building |Gromacs|.
+The former, compile-time dependencies are standard components,
+hence stock versions can be obtained from most Linux distribution
+repositories (e.g. ``opencl-headers`` and ``ocl-icd-libopencl1`` on Debian/Ubuntu).
+Only the compatibility with the required OpenCL_ version |REQUIRED_OPENCL_MIN_VERSION|
+needs to be ensured.
+Alternatively, the headers and library can also be obtained from vendor SDKs
+(e.g. `from AMD <http://developer.amd.com/appsdk>`_),
+which must be installed in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``).
+
+To trigger an OpenCL_ build the following CMake flags must be set
::
cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
-Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
-known limitations in the user guide. If you want to
-do so anyway, because NVIDIA OpenCL support is part of the CUDA
-package, a C++ compiler supported by your CUDA installation is
-required.
-
On Mac OS, an AMD GPU can be used only with OS version 10.10.4 and
higher; earlier OS versions are known to run incorrectly.
Building only mdrun
^^^^^^^^^^^^^^^^^^^
-Past versions of the build system offered "mdrun" and "install-mdrun"
-targets (similarly for other programs too) to build and install only
-the mdrun program, respectively. Such a build is useful when the
-configuration is only relevant for mdrun (such as with
-parallelization options for MPI, SIMD, GPUs, or on BlueGene or Cray),
-or the length of time for the compile-link-install cycle is relevant
-when developing.
This is now supported with the ``cmake`` option
-``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a cut-down version of
-``libgromacs`` and/or the mdrun program.
+``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a different version of
+``libgromacs`` and the ``mdrun`` program.
Naturally, now ``make install`` installs only those
products. By default, mdrun-only builds will default to static linking
against |Gromacs| libraries, because this is generally a good idea for
-the targets for which an mdrun-only build is desirable. If you re-use
-a build tree and change to the mdrun-only build, then you will inherit
-the setting for ``BUILD_SHARED_LIBS`` from the old build, and will be
-warned that you may wish to manage ``BUILD_SHARED_LIBS`` yourself.
+the targets for which an mdrun-only build is desirable.
Installing |Gromacs|
--------------------
run an MPI program is called ``srun``.
The ``make check`` target also runs integration-style tests that may run
-with MPI if ``GMX_MPI=ON`` was set. To make these work, you may need to
-set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``, ``NUMPROC``,
+with MPI if ``GMX_MPI=ON`` was set. To make these work with various possible
+MPI libraries, you may need to
+set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``,
``MPIEXEC_PREFLAGS`` and ``MPIEXEC_POSTFLAGS`` so that
``mdrun-mpi-test_mpi`` would run on multiple ranks via the shell command
${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${NUMPROC} ${MPIEXEC_PREFLAGS} \
mdrun-mpi-test_mpi ${MPIEXEC_POSTFLAGS} -otherflags
-Typically, one might use variable values ``mpirun``, ``-np``, ``2``, ``''``,
-``''`` respectively, in order to run on two ranks.
+A typical example for SLURM is
+
+::
+
+ cmake .. -DGMX_MPI=on -DMPIEXEC=srun -DMPIEXEC_NUMPROC_FLAG=-n -DMPIEXEC_PREFLAGS= -DMPIEXEC_POSTFLAGS=
Testing |Gromacs| for performance
When different ranks have a different computational load
(load imbalance), all ranks will have to wait for the one
that takes the most time. One would like to avoid such a situation.
-Load imbalance can occur due to three reasons:
+Load imbalance can occur due to four reasons:
\begin{itemize}
\item inhomogeneous particle distribution
\item inhomogeneous interaction cost distribution (charged/uncharged,
water/non-water due to {\gromacs} water innerloops)
\item statistical fluctuation (only with small particle numbers)
+\item differences in communication time, due to network topology and/or other jobs on the machine interfering with our communication
\end{itemize}
So we need a dynamic load balancing algorithm
where the volume of each domain decomposition cell
By default, {\tt mdrun} automatically turns on the dynamic load
balancing during a simulation when the total performance loss
-due to the force calculation imbalance is 5\% or more.
+due to the force calculation imbalance is 2\% or more.
{\bf Note} that the reported force load imbalance numbers might be higher,
since the force calculation is only part of work that needs to be done
during an integration step.
The minimum allowed scaling can be changed with the {\tt -dds}
option of {\tt mdrun}.
+The load imbalance is measured by timing a single region of the MD step
+on each MPI rank. This region can not include MPI communication, as
+timing of MPI calls does not allow separating wait due to imbalance
+from actual communication.
+The domain volumes are then scaled, with under-relaxation, inversely
+proportional with the measured time. This procedure will decrease the
+load imbalance when the change in load in the measured region correlates
+with the change in domain volume and the load outside
+the measured region does not depend strongly on the domain volume.
+In CPU-only simulations, the load is measured between the coordinate
+and the force communication. In hybrid CPU-GPU simulations we overlap
+communication on the CPU with calculation on the GPU. Therefore we
+measure from the last communication before the force calculation to
+when the CPU or GPU is finished, whichever is last.
+When not using PME ranks, we subtract the time in PME from the CPU time,
+as this includes MPI calls and the PME load is independent of domain size.
+This generally works well, unless the non-bonded load is low and there is
+imbalance in the bonded interactions. Then two issues can arise.
+Dynamic load balancing can increase the imbalance in update and constraints
+and with PME the coordinate and force redistribution time can go up
+significantly. Although dynamic load balancing
+can significantly improve performance in cases where there is imbalance in
+the bonded interactions on the CPU, there are many situations in which
+some domains continue decreasing in size and the load imbalance increases
+and/or PME coordinate and force redistribution cost increases significantly.
+As of version 2016.1, {\tt mdrun} disables the dynamic load balancing when
+measurement indicates that it deteriorates performance. This means that in most
+cases the user will get good performance with the default, automated
+dynamic load balancing setting.
+
\subsection{Constraints in parallel\index{constraints}}
\label{subsec:plincs}
Since with domain decomposition parts of molecules can reside
disable the use of the lower-latency cudaLaunchKernel API even when supported (CUDA >=v7.0).
Should only be used for benchmarking purposes.
+``GMX_DISABLE_CUDA_TIMING``
+ Disables GPU timing of CUDA tasks; synonymous with ``GMX_DISABLE_GPU_TIMING``.
+
``GMX_CYCLE_ALL``
times all code during runs. Incompatible with threads.
disables architecture-specific SIMD-optimized (SSE2, SSE4.1, AVX, etc.)
non-bonded kernels thus forcing the use of plain C kernels.
-``GMX_DISABLE_CUDA_TIMING``
+``GMX_DISABLE_GPU_TIMING``
timing of asynchronously executed GPU operations can have a
non-negligible overhead with short step times. Disabling timing can improve performance in these cases.
only the flavor required for the simulation is generated and
compiled.
-``GMX_OCL_FASTMATH``
- Adds the option ``cl-fast-relaxed-math`` to the compiler
- options (in the CUDA version this is enabled by default, it is likely that
- the same will happen with the OpenCL version soon)
+``GMX_OCL_DISABLE_FASTMATH``
+ Prevents the use of ``-cl-fast-relaxed-math`` compiler option.
``GMX_OCL_DUMP_LOG``
If defined, the OpenCL build log is always written to the
.. _NVIDIA blog article: https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/
+Reducing overheads in GPU accelerated runs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order for CPU cores and GPU(s) to execute concurrently, tasks are
+launched and executed asynchronously on the GPU(s) while the CPU cores
+execute non-offloaded force computation (like long-range PME electrostatics).
+Asynchronous task launches are handled by the GPU device driver and
+require CPU involvement. Therefore, the work of scheduling
+GPU tasks will incur an overhead that can in some cases significantly
+delay or interfere with the CPU execution.
+
+Delays in CPU execution are caused by the latency of launching GPU tasks,
+an overhead that can become significant as simulation ns/day increases
+(i.e. with shorter wall-time per step).
+The overhead is measured by :ref:`gmx mdrun` and reported in the performance
+summary section of the log file ("Launch GPU ops" row).
+A few percent of runtime spent in this category is normal,
+but in fast-iterating and multi-GPU parallel runs 10% or larger overheads can be observed.
+In general, a user can do little to avoid such overheads, but there
+are a few cases where tweaks can give performance benefits.
+In single-rank runs timing of GPU tasks is by default enabled and,
+while in most cases its impact is small, in fast runs performance can be affected.
+The performance impact will be most significant on NVIDIA GPUs with CUDA,
+less on AMD with OpenCL.
+In these cases, when more than a few percent of "Launch GPU ops" time is observed,
+it is recommended turning off timing by setting the ``GMX_DISABLE_GPU_TIMING``
+environment variable.
+In parallel runs with many ranks sharing a GPU,
+launch overheads can also be reduced by starting fewer thread-MPI
+or MPI ranks per GPU; e.g. most often one rank per thread or core is not optimal.
+
+The second type of overhead, interference of the GPU driver with CPU computation,
+is caused by the scheduling and coordination of GPU tasks.
+A separate GPU driver thread can require CPU resources
+which may clash with the concurrently running non-offloaded tasks,
+potentially degrading the performance of PME or bonded force computation.
+This effect is most pronounced when using AMD GPUs with OpenCL with
+all stable driver releases to date (up to and including fglrx 12.15).
+To minimize the overhead it is recommended to
+leave a CPU hardware thread unused when launching :ref:`gmx mdrun`,
+especially on CPUs with high core count and/or HyperThreading enabled.
+E.g. on a machine with a 4-core CPU and eight threads (via HyperThreading) and an AMD GPU,
+try ``gmx mdrun -ntomp 7 -pin on``.
+This will leave free CPU resources for the GPU task scheduling
+reducing interference with CPU computation.
+Note that assigning fewer resources to :ref:`gmx mdrun` CPU computation
+involves a tradeoff which may outweigh the benefits of reduced GPU driver overhead,
+in particular without HyperThreading and with few CPU cores.
+
TODO In future patch: any tips not covered above
Running the OpenCL version of mdrun
minimum OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|. See
also the :ref:`known limitations <opencl-known-limitations>`.
+Devices from the AMD GCN architectures (all series) and NVIDIA Fermi
+and later (compute capability 2.0) are known to work, but before
+doing production runs always make sure that the |Gromacs| tests
+pass successfully on the hardware.
+
+The OpenCL GPU kernels are compiled at run time. Hence,
+building the OpenCL program can take a few seconds introducing a slight
+delay in the :ref:`gmx mdrun` startup. This is not normally a
+problem for long production MD, but you might prefer to do some kinds
+of work, e.g. that runs very few steps, on just the CPU (e.g. see ``-nb`` above).
+
The same ``-gpu_id`` option (or ``GMX_GPU_ID`` environment variable)
used to select CUDA devices, or to define a mapping of GPUs to PP
ranks, is used for OpenCL devices.
-The following devices are known to work correctly:
- - AMD: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 240,
- Radeon R7 M260, Radeon R9 290
- - NVIDIA: GeForce GTX 660M, GeForce GTX 660Ti, GeForce GTX 750Ti,
- GeForce GTX 780, GTX Titan
-
-Building the OpenCL program can take a few seconds when :ref:`gmx
-mdrun` starts up, because the kernels that run on the
-GPU can only be compiled at run time. This is not normally a
-problem for long production MD, but you might prefer to do some kinds
-of work on just the CPU (e.g. see ``-nb`` above).
-
Some other :ref:`OpenCL management <opencl-management>` environment
variables may be of interest to developers.
almost no performance gain when using NVIDIA GPUs.
The issue affects NVIDIA driver versions up to 349 series, but it
is known to be fixed in the 352 and later driver releases.
+- On NVIDIA GPUs the OpenCL kernels achieve much lower performance
+ than the equivalent CUDA kernels due to limitations of the NVIDIA OpenCL
+ compiler.
- The AMD APPSDK version 3.0 ships with OpenCL compiler/runtime components,
libamdocl12cl64.so and libamdocl64.so (only in earlier releases),
that conflict with newer fglrx GPU drivers which provide the same libraries.
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
set(_fftw_simd_support_level "--enable-sse2")
elseif(${GMX_SIMD} MATCHES "^(AVX)")
# Testing shows FFTW configured with --enable-sse2 --enable-avx is
- # slightly faster on most architectures than --enable-sse2 alone
- set(_fftw_simd_support_level --enable-sse2;--enable-avx)
+ # slightly faster on most architectures than --enable-sse2 alone.
+ # Support for --enable-avx2 was only added in 3.3.5, but
+ # configuring with it is at worst a warning, even on an earlier
+ # version.
+ set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2;--enable-avx512)
+elseif(${GMX_SIMD} MATCHES "^(VSX)")
+ set(_fftw_simd_support_level --enable-vsx)
endif()
set(GMX_BUILD_OWN_FFTW_OPTIMIZATION_CONFIGURATION ${_fftw_simd_support_level} CACHE INTERNAL "Optimization flags for FFTW compilation")
endif()
# Machinery for running the external project
-set(EXTERNAL_FFTW_VERSION 3.3.4)
+set(EXTERNAL_FFTW_VERSION 3.3.5)
# cmake make eats slashes //// -> //
set(GMX_BUILD_OWN_FFTW_URL
"http:////www.fftw.org/fftw-${EXTERNAL_FFTW_VERSION}.tar.gz" CACHE PATH
"URL from where to download fftw (use an absolute path when offline, adjust GMX_BUILD_OWN_FFTW_MD5 if downloading other version than ${EXTERNAL_FFTW_VERSION})")
-set(GMX_BUILD_OWN_FFTW_MD5 2edab8c06b24feeb3b82bbb3ebf3e7b3 CACHE STRING
+set(GMX_BUILD_OWN_FFTW_MD5 6cc08a3b9c7ee06fdd5b9eb02e06f569 CACHE STRING
"Expected MD5 hash for the file at GMX_BUILD_OWN_FFTW_URL")
mark_as_advanced(GMX_BUILD_OWN_FFTW_URL GMX_BUILD_OWN_FFTW_MD5)
#define GMX_DD_NNODES_SENDRECV 4
+/* We check if to turn on DLB at the first and every 100 DD partitionings.
+ * With large imbalance DLB will turn on at the first step, so we can
+ * make the interval so large that the MPI overhead of the check is negligible.
+ */
+static const int c_checkTurnDlbOnInterval = 100;
+/* We need to check if DLB results in worse performance and then turn it off.
+ * We check this more often than for turning DLB on, because the DLB can scale
+ * the domains very rapidly, so if unlucky the load imbalance can go up quickly
+ * and furthermore, we are already synchronizing often with DLB, so
+ * the overhead of the MPI Bcast is not that high.
+ */
+static const int c_checkTurnDlbOffInterval = 20;
+
+/* Forward declaration */
+static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
+
+
/*
#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
static bool dlbIsOn(const gmx_domdec_comm_t *comm)
{
- return (comm->dlbState == edlbsOn);
+ return (comm->dlbState == edlbsOnCanTurnOff ||
+ comm->dlbState == edlbsOnForever);
}
static void vec_rvec_init(vec_rvec_t *v)
/* This error should never be triggered under normal
* circumstances, but you never know ...
*/
- gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
+ gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
gmx_step_str(step, buf),
dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
}
if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
{
char buf[22];
- gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
+ gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
gmx_step_str(step, buf),
dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
ncd, comm->cellsize_min[dim]);
comm->cellsize_min[dim])
{
char buf[22];
- gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
+ gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
gmx_step_str(step, buf), dim2char(dim),
comm->cell_x1[dim] - comm->cell_x0[dim],
ddbox->skew_fac[dim],
{
case 'a': dlbState = edlbsOffCanTurnOn; break;
case 'n': dlbState = edlbsOffForever; break;
- case 'y': dlbState = edlbsOn; break;
+ case 'y': dlbState = edlbsOnForever; break;
default: gmx_incons("Unknown dlb_opt");
}
if (!EI_DYNAMICS(ir->eI))
{
- if (dlbState == edlbsOn)
+ if (dlbState == edlbsOnForever)
{
sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
dd_warning(cr, fplog, buf);
case edlbsOffForever:
break;
case edlbsOffCanTurnOn:
+ case edlbsOnCanTurnOff:
dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
dlbState = edlbsOffForever;
break;
- case edlbsOn:
+ case edlbsOnForever:
dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
break;
default:
/* Initialize to GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
- comm->bCheckWhetherToTurnDlbOn = TRUE;
+ comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
+ /* To consider turning DLB on after 2*nstlist steps we need to check
+ * at partitioning count 3. Thus we need to increase the first count by 2.
+ */
+ comm->ddPartioningCountFirstDlbOff += 2;
if (fplog)
{
gmx_domdec_comm_t *comm;
real cellsize_min;
int d, nc, i;
- char buf[STRLEN];
dd = cr->dd;
comm = dd->comm;
- if (fplog)
- {
- fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
- }
-
cellsize_min = comm->cellsize_min[dd->dim[0]];
for (d = 1; d < dd->ndim; d++)
{
if (cellsize_min < comm->cellsize_limit*1.05)
{
- dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+ char buf[STRLEN];
+ sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
/* Change DLB from "auto" to "no". */
comm->dlbState = edlbsOffForever;
return;
}
- dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
- comm->dlbState = edlbsOn;
+ char buf[STRLEN];
+ sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
+ dd_warning(cr, fplog, buf);
+ comm->dlbState = edlbsOnCanTurnOff;
+
+ /* Store the non-DLB performance, so we can check if DLB actually
+ * improves performance.
+ */
+ GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
+ comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
set_dlb_limits(dd);
}
}
+/* Turns off dynamic load balancing after it was found to degrade
+ * performance, while leaving it eligible to be turned on again later
+ * (state edlbsOffCanTurnOn). Records that DLB was turned off once and
+ * stores the current partitioning count, so that the cycle counts
+ * measured right after the switch can be ignored.
+ */
+static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+{
+    gmx_domdec_t *dd = cr->dd;
+
+    char buf[STRLEN];
+    sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
+    dd_warning(cr, fplog, buf);
+    /* Off, but can be turned on again if measurements warrant it */
+    dd->comm->dlbState = edlbsOffCanTurnOn;
+    dd->comm->haveTurnedOffDlb = true;
+    /* Remember when we turned DLB off, so the first cycle counts
+     * after the switch can be excluded from the averages.
+     */
+    dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+}
+
+/* Permanently disables dynamic load balancing for the remainder of the
+ * run, because measurements without DLB showed it degraded performance.
+ * Only valid when DLB is currently off but eligible to turn on
+ * (asserted below).
+ */
+static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+{
+    GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
+    char buf[STRLEN];
+    sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
+    dd_warning(cr, fplog, buf);
+    cr->dd->comm->dlbState = edlbsOffForever;
+}
+
static char *init_bLocalCG(const gmx_mtop_t *mtop)
{
int ncg, cg;
if (dd->comm->dlbState == edlbsOffCanTurnOn)
{
dd->comm->bCheckWhetherToTurnDlbOn = bValue;
+
+ if (bValue == TRUE)
+ {
+ /* Store the DD partitioning count, so we can ignore cycle counts
+ * over the next nstlist steps, which are often slower.
+ */
+ dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+ }
}
}
*/
static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
{
- const int nddp_chk_dlb = 100;
-
if (dd->comm->dlbState != edlbsOffCanTurnOn)
{
return FALSE;
}
+ if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
+ {
+ /* We ignore the first nstlist steps at the start of the run
+ * or after PME load balancing or after turning DLB off, since
+ * these often have extra allocation or cache miss overhead.
+ */
+ return FALSE;
+ }
+
/* We should check whether we should use DLB directly after
* unlocking DLB. */
if (dd->comm->bCheckWhetherToTurnDlbOn)
dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
return TRUE;
}
- /* We should also check whether we should use DLB every 100
+ /* We check whether we should use DLB every c_checkTurnDlbOnInterval
     * partitionings (we do not do this every partitioning, so that we
* avoid excessive communication). */
- if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
+ if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
{
return TRUE;
}
gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
{
- return (dd->comm->dlbState == edlbsOn);
+ return dlbIsOn(dd->comm);
}
gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
gmx_int64_t step_pcoupl;
rvec cell_ns_x0, cell_ns_x1;
int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
- gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
+ gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
gmx_bool bRedist, bSortCG, bResortAll;
ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
real grid_density;
}
comm->n_load_collect++;
- if (bCheckWhetherToTurnDlbOn)
+ if (dlbIsOn(comm))
{
+ if (DDMASTER(dd))
+ {
+ /* Add the measured cycles to the running average */
+ const float averageFactor = 0.1f;
+ comm->cyclesPerStepDlbExpAverage =
+ (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
+ averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
+ }
+ if (comm->dlbState == edlbsOnCanTurnOff &&
+ dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
+ {
+ gmx_bool turnOffDlb;
+ if (DDMASTER(dd))
+ {
+ /* If the running averaged cycles with DLB are more
+ * than before we turned on DLB, turn off DLB.
+ * We will again run and check the cycles without DLB
+                     * and we can then decide whether to turn off DLB forever.
+ */
+ turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
+ comm->cyclesPerStepBeforeDLB);
+ }
+ dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
+ if (turnOffDlb)
+ {
+ /* To turn off DLB, we need to redistribute the atoms */
+ dd_collect_state(dd, state_local, state_global);
+ bMasterState = TRUE;
+ turn_off_dlb(fplog, cr, step);
+ }
+ }
+ }
+ else if (bCheckWhetherToTurnDlbOn)
+ {
+ gmx_bool turnOffDlbForever = FALSE;
+ gmx_bool turnOnDlb = FALSE;
+
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
{
- /* Here we check if the max PME rank load is more than 0.98
- * the max PP force load. If so, PP DLB will not help,
- * since we are (almost) limited by PME. Furthermore,
- * DLB will cause a significant extra x/f redistribution
- * cost on the PME ranks, which will then surely result
- * in lower total performance.
- * This check might be fragile, since one measurement
- * below 0.98 (although only done once every 100 DD part.)
- * could turn on DLB for the rest of the run.
+ /* If we recently turned off DLB, we want to check if
+ * performance is better without DLB. We want to do this
+ * ASAP to minimize the chance that external factors
+                 * that slowed down the DLB step are gone here and we
+ * incorrectly conclude that DLB was causing the slowdown.
+ * So we measure one nstlist block, no running average.
*/
- if (cr->npmenodes > 0 &&
- dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ if (comm->haveTurnedOffDlb &&
+ comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
+ comm->cyclesPerStepDlbExpAverage)
{
- bTurnOnDLB = FALSE;
+ /* After turning off DLB we ran nstlist steps in fewer
+ * cycles than with DLB. This likely means that DLB
+                     * is not beneficial, but this could be due to a one
+ * time unlucky fluctuation, so we require two such
+ * observations in close succession to turn off DLB
+ * forever.
+ */
+ if (comm->dlbSlowerPartitioningCount > 0 &&
+ dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
+ {
+ turnOffDlbForever = TRUE;
+ }
+ comm->haveTurnedOffDlb = false;
+ /* Register when we last measured DLB slowdown */
+ comm->dlbSlowerPartitioningCount = dd->ddp_count;
}
else
{
- bTurnOnDLB =
- (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
- }
- if (debug)
- {
- fprintf(debug, "step %s, imb loss %f\n",
- gmx_step_str(step, sbuf),
- dd_force_imb_perf_loss(dd));
+ /* Here we check if the max PME rank load is more than 0.98
+ * the max PP force load. If so, PP DLB will not help,
+ * since we are (almost) limited by PME. Furthermore,
+ * DLB will cause a significant extra x/f redistribution
+ * cost on the PME ranks, which will then surely result
+ * in lower total performance.
+ */
+ if (cr->npmenodes > 0 &&
+ dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ {
+ turnOnDlb = FALSE;
+ }
+ else
+ {
+ turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ }
}
}
- dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
- if (bTurnOnDLB)
+ struct
+ {
+ gmx_bool turnOffDlbForever;
+ gmx_bool turnOnDlb;
+ }
+ bools {
+ turnOffDlbForever, turnOnDlb
+ };
+ dd_bcast(dd, sizeof(bools), &bools);
+ if (bools.turnOffDlbForever)
+ {
+ turn_off_dlb_forever(fplog, cr, step);
+ }
+ else if (bools.turnOnDlb)
{
turn_on_dlb(fplog, cr, step);
bDoDLB = TRUE;
edlbsOffForever, /**< DLB is off and will never be turned on */
edlbsOffCanTurnOn, /**< DLB is off and will turn on on imbalance */
edlbsOffTemporarilyLocked, /**< DLB is off and temporarily can't turn on */
- edlbsOn, /**< DLB is on and will stay on forever */
+ edlbsOnCanTurnOff, /**< DLB is on and can turn off when slow */
+ edlbsOnForever, /**< DLB is on and will stay on forever, because the user chose this */
edlbsNR /**< The number of DLB states */
};
-/* Allowed DLB state transitions:
- * edlbsOffCanTurnOn -> edlbsOn
+/* Allowed DLB state transitions in automatic mode:
+ * edlbsOffCanTurnOn -> edlbsOnCanTurnOff
* edlbsOffCanTurnOn -> edlbsOffForever
* edlbsOffCanTurnOn -> edlbsOffTemporarilyLocked
* edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ * edlbsOnCanTurnOff -> edlbsOffCanTurnOn
*/
/*! \brief The PME domain decomposition for one dimension */
int dlbState;
/* With dlbState=edlbsOffCanTurnOn, should we check if to DLB on at the next DD? */
gmx_bool bCheckWhetherToTurnDlbOn;
+ /* The first DD count since we are running without DLB */
+ int ddPartioningCountFirstDlbOff;
/* Cell sizes for static load balancing, first index cartesian */
real **slb_frac;
/** Which cg distribution is stored on the master node,
* stored as DD partitioning call count.
*/
- int master_cg_ddp_count;
+ gmx_int64_t master_cg_ddp_count;
/** The number of cg's received from the direct neighbors */
int zone_ncg1[DD_MAXZONE];
/** How many times have we collected the load measurements */
int n_load_collect;
+ /* Cycle count history for DLB checks */
+ float cyclesPerStepBeforeDLB; /**< The averaged cycles per step over the last nstlist step before turning on DLB */
+ float cyclesPerStepDlbExpAverage; /**< The running average of the cycles per step during DLB */
+ bool haveTurnedOffDlb; /**< Have we turned off DLB (after turning DLB on)? */
+ gmx_int64_t dlbSlowerPartitioningCount; /**< The DD step at which we last measured that DLB off was faster than DLB on, 0 if there was no such step */
+
/* Statistics */
double sum_nat[ddnatNR-ddnatZONE]; /**< The atoms per zone, summed over the steps */
int ndecomp; /**< The number of partioning calls */
nSA_time = str_nelem(is->anneal_time, MAXPTR, ptr1);
if (nSA_time != k)
{
- gmx_fatal(FARGS, "Found %d annealing-time values, wanter %d\n", nSA_time, k);
+ gmx_fatal(FARGS, "Found %d annealing-time values, wanted %d\n", nSA_time, k);
}
nSA_temp = str_nelem(is->anneal_temp, MAXPTR, ptr2);
if (nSA_temp != k)
compilerOptions += " -cl-opt-disable";
}
- if (getenv("GMX_OCL_FASTMATH") )
+    /* Fastmath improves performance on all supported arch */
+ if (getenv("GMX_OCL_DISABLE_FASTMATH") == NULL)
{
compilerOptions += " -cl-fast-relaxed-math";
}
if (bWarn)
{
- if (maxwarn >= 0)
+ if (maxwarn < INT_MAX)
{
cconerr(lincsd, xprime, pbc,
&ncons_loc, &p_ssd, &p_max, &p_imax);
invdt, v, vir != NULL, vir_r_m_dr,
econq, nrnb,
constr->maxwarn, &constr->warncount_lincs);
- if (!bOK && constr->maxwarn >= 0)
+ if (!bOK && constr->maxwarn < INT_MAX)
{
if (fplog != NULL)
{
idef, ir, x, xprime, nrnb,
constr->lagr, lambda, dvdlambda,
invdt, v, vir != NULL, vir_r_m_dr,
- constr->maxwarn >= 0, econq);
+ constr->maxwarn < INT_MAX, econq);
break;
case (econqVeloc):
bOK = bshakef(fplog, constr->shaked,
idef, ir, x, min_proj, nrnb,
constr->lagr, lambda, dvdlambda,
invdt, NULL, vir != NULL, vir_r_m_dr,
- constr->maxwarn >= 0, econq);
+ constr->maxwarn < INT_MAX, econq);
break;
default:
gmx_fatal(FARGS, "Internal error, SHAKE called for constraining something else than coordinates");
break;
}
- if (!bOK && constr->maxwarn >= 0)
+ if (!bOK && constr->maxwarn < INT_MAX)
{
if (fplog != NULL)
{
{
constr->maxwarn = 0;
sscanf(env, "%8d", &constr->maxwarn);
+ if (constr->maxwarn < 0)
+ {
+ constr->maxwarn = INT_MAX;
+ }
if (fplog)
{
fprintf(fplog,
constr->maxwarn);
}
}
- if (constr->maxwarn < 0 && fplog)
- {
- fprintf(fplog, "maxwarn < 0, will not stop on constraint errors\n");
- }
constr->warncount_lincs = 0;
constr->warncount_settle = 0;
/* CUDA timing disabled as event timers don't work:
- with multiple streams = domain-decomposition;
- - when turned off by GMX_DISABLE_CUDA_TIMING.
+ - when turned off by GMX_DISABLE_CUDA_TIMING/GMX_DISABLE_GPU_TIMING.
*/
nb->bDoTime = (!nb->bUseTwoStreams &&
- (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
+ (getenv("GMX_DISABLE_CUDA_TIMING") == NULL) &&
+ (getenv("GMX_DISABLE_GPU_TIMING") == NULL));
if (nb->bDoTime)
{
init_plist(nb->plist[eintLocal]);
/* OpenCL timing disabled if GMX_DISABLE_OCL_TIMING is defined. */
- nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
+ /* TODO deprecate the first env var in the 2017 release. */
+ nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL &&
+ getenv("GMX_DISABLE_GPU_TIMING") == NULL);
/* Create queues only after bDoTime has been initialized */
if (nb->bDoTime)
#include "nbnxn_gpu.h"
-static const bool useCuda = GMX_GPU == GMX_GPU_CUDA;
-static const bool useOpenCL = GMX_GPU == GMX_GPU_OPENCL;
-
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
gmx_int64_t step,
if (bDoForces && DOMAINDECOMP(cr))
{
- if (bUseGPU && useCuda)
+ if (bUseGPU)
{
/* We are done with the CPU compute, but the GPU local non-bonded
* kernel can still be running while we communicate the forces.
if (bUseOrEmulGPU)
{
/* wait for local forces (or calculate in emulation mode) */
- if (bUseGPU && useCuda)
+ if (bUseGPU)
{
float cycles_tmp, cycles_wait_est;
- const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
+ * but even with a step of 0.1 ms the difference is less than 1%
+ * of the step time.
+ */
+ const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
{
cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
- if (cycles_tmp < cuda_api_overhead_margin)
+ if (cycles_tmp < gpuWaitApiOverheadMargin)
{
/* We measured few cycles, it could be that the kernel
* and transfer finished earlier and there was no actual
*/
cycles_force += cycles_wait_est;
cycles_wait_gpu += cycles_wait_est;
- }
- else if (bUseGPU && useOpenCL)
- {
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
- flags, eatLocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- }
- if (bUseGPU)
- {
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);