# Test newest gcc supported by newest CUDA at time of release
# Test thread-MPI with CUDA
-gcc-5 gpu cuda-8.0 openmp release
+# Test SIMD (AVX2_256) GPU code-path
+gcc-5 gpu cuda-8.0 openmp release simd=avx2_256
# Test with ThreadSanitizer (without OpenMP, because of Redmine #1850)
# Test AVX2_256 SIMD
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2014,2015, by the GROMACS development team, led by
+# Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# --------
#
# Find the NVIDIA Management Library (NVML) includes and library. NVML documentation
-# is available at: http://docs.nvidia.com/deploy/nvml-api/index.html
+# is available at: http://docs.nvidia.com/deploy/nvml-api/index.html
#
-# NVML is part of the GPU Deployment Kit (GDK) and GPU_DEPLOYMENT_KIT_ROOT_DIR can
-# be specified if the GPU Deployment Kit is not installed in a default location.
+# Starting with CUDA 8 NVML is part of the CUDA Toolkit. Prior to CUDA 8 NVML was part
+# of the GPU Deployment Kit (GDK) and GPU_DEPLOYMENT_KIT_ROOT_DIR can be specified
+# if the GPU Deployment Kit is not installed in a default location.
#
-# FindNVML defines the following variables:
+# FindNVML defines the following variables:
#
# NVML_INCLUDE_DIR, where to find nvml.h, etc.
# NVML_LIBRARY, the libraries needed to use NVML.
# Jiri Kraus, NVIDIA Corp (nvidia.com - jkraus)
#
-# Copyright (c) 2008 - 2014 NVIDIA Corporation. All rights reserved.
+# Copyright (c) 2008 - 2014,2017 NVIDIA Corporation. All rights reserved.
#
# This code is licensed under the MIT License. See the FindNVML.cmake script
# for the text of the license.
###############################################################################
if( CMAKE_SYSTEM_NAME STREQUAL "Windows" )
- set( NVML_LIB_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/lib" )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_LIB_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/lib")
- endif()
set(NVML_NAMES nvml)
-
- set( NVML_INC_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/include" )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_INC_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/include")
+ if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
+ set( NVML_LIB_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/lib" )
+ if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
+ list(APPEND NVML_LIB_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/lib")
+ endif()
+
+ set( NVML_INC_PATHS "C:/Program Files/NVIDIA Corporation/GDK/nvml/include" )
+ if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
+ list(APPEND NVML_INC_PATHS "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/nvml/include")
+ endif()
+ else()
+ set( NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" )
+ set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
endif()
else()
- # The Linux installer for the GPU Deployment Kit adds a "usr"
- # suffix to a custom path if one is used, so a user could
- # reasonably set GPU_DEPLOYMENT_KIT_ROOT_DIR to the value they
- # passed to the installer, or the root where they later found the
- # kit to be installed. Below, we cater for both possibilities.
+ set(NVML_NAMES nvidia-ml)
+
set( NVML_LIB_PATHS /usr/lib64 )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_LIB_PATHS
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/src/gdk/nvml/lib"
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/src/gdk/nvml/lib"
- )
+ if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
+ # The Linux installer for the GPU Deployment Kit adds a "usr"
+ # suffix to a custom path if one is used, so a user could
+ # reasonably set GPU_DEPLOYMENT_KIT_ROOT_DIR to the value they
+ # passed to the installer, or the root where they later found the
+ # kit to be installed. Below, we cater for both possibilities.
+ if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
+ list(APPEND NVML_LIB_PATHS
+ "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/src/gdk/nvml/lib"
+ "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/src/gdk/nvml/lib"
+ )
+ endif()
+ else()
+ list(APPEND NVML_LIB_PATHS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs")
endif()
- set(NVML_NAMES nvidia-ml)
-
- set( NVML_INC_PATHS /usr/include/nvidia/gdk/ /usr/include )
- if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
- list(APPEND NVML_INC_PATHS
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/include/nvidia/gdk"
- "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/include/nvidia/gdk"
- )
+
+ if(${CUDA_VERSION_STRING} VERSION_LESS "8.0")
+ set( NVML_INC_PATHS /usr/include/nvidia/gdk/ /usr/include )
+ if(GPU_DEPLOYMENT_KIT_ROOT_DIR)
+ list(APPEND NVML_INC_PATHS
+ "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/include/nvidia/gdk"
+ "${GPU_DEPLOYMENT_KIT_ROOT_DIR}/usr/include/nvidia/gdk"
+ )
+ endif()
+ else()
+ set( NVML_INC_PATHS ${CUDA_INCLUDE_DIRS} )
endif()
endif()
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# assemble the CUDA flags
list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CUDA_NVCC_GENCODE_FLAGS}")
list(APPEND GMX_CUDA_NVCC_FLAGS "-use_fast_math")
+if (CUDA_VERSION VERSION_EQUAL "8.0")
+ # requesting sm_20 triggers deprecation messages with nvcc 8.0 which we better avoid
+ list(APPEND GMX_CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
# assemble the CUDA host compiler flags
list(APPEND GMX_CUDA_NVCC_FLAGS "${CUDA_HOST_COMPILER_OPTIONS}")
user-guide/index.rst
user-guide/getting-started.rst
user-guide/flow.rst
+ user-guide/floating-point.rst
user-guide/system-preparation.rst
user-guide/cutoff-schemes.rst
user-guide/managing-simulations.rst
.. cmake:: GMX_DOUBLE
- Many part of GROMACS are implemented in terms of "real" precision,
+ Many parts of |Gromacs| are implemented in terms of "real" precision,
which is actually either a single- or double-precision type,
according to the value of this flag. Some parts of the code
deliberately use single- or double-precision types, and these are
* All source files and other non-trivial scripts should contain a copyright
header with a predetermined format and license information (check existing
- files). Copyright holder should be "the GROMACS development team" for the
+ files). Copyright holder should be "the |Gromacs| development team" for the
years where the code has been in the |Gromacs| source repository, but earlier
years can hold other copyrights.
* Whenever you update a file, you should check that the current year is listed
Allowed language features
=========================
-GROMACS uses C99 for C files and C++11 for C++ files.
+|Gromacs| uses C99 for C files and C++11 for C++ files.
C++ has a lot of features, but to keep the source code maintainable and easy to read,
we will avoid using some of them in Gromacs code. The basic principle is to keep things
as simple as possible.
``install-guide``
Makes the INSTALL file for the tarball with Sphinx
``webpage-sphinx``
- Makes all the components of the GROMACS webpage that require Sphinx,
+ Makes all the components of the |Gromacs| webpage that require Sphinx,
including install guide and user guide.
``webpage``
- Makes the complete GROMACS webpage, requires everything. When complete,
+ Makes the complete |Gromacs| webpage, requires everything. When complete,
you can browse ``docs/html/index.html`` to find everything.
If built from a release tarball, the ``SOURCE_MD5SUM``,
* Answers to `Frequently Asked Questions <http://www.gromacs.org/Documentation/FAQs>`_
-* Coping with `errors while using GROMACS <http://www.gromacs.org/Documentation/Errors>`_
+* Coping with `errors while using |Gromacs| <http://www.gromacs.org/Documentation/Errors>`_
* Links to `tutorial material <http://www.gromacs.org/Documentation/Tutorials>`_
If you need to customize this further, use
::
- -DGMX_FFT_LIBRARY=mkl
- -DMKL_LIBRARIES="/full/path/to/libone.so;/full/path/to/libtwo.so"
- -DMKL_INCLUDE_DIR="/full/path/to/mkl/include"
+
+ cmake -DGMX_FFT_LIBRARY=mkl \
+ -DMKL_LIBRARIES="/full/path/to/libone.so;/full/path/to/libtwo.so" \
+ -DMKL_INCLUDE_DIR="/full/path/to/mkl/include"
The full list and order(!) of libraries you require are found in Intel's MKL documentation for your system.
.. _non-standard location:
-Where to install GROMACS
-^^^^^^^^^^^^^^^^^^^^^^^^
+Where to install |Gromacs|
+^^^^^^^^^^^^^^^^^^^^^^^^^^
|Gromacs| is installed in the directory to which
``CMAKE_INSTALL_PREFIX`` points. It may not be the source directory or
supported by the nvcc compiler (and the |Gromacs| build system).
However, it can be beneficial to manually pick the specific CUDA architecture(s)
to generate code for either to reduce compilation time (and binary size) or to
-target a new architecture not yet supported by the |GROMACS| build system.
+target a new architecture not yet supported by the |Gromacs| build system.
Setting the desired CUDA architecture(s) and virtual architecture(s)
can be done using the ``GMX_CUDA_TARGET_SM`` and ``GMX_CUDA_TARGET_COMPUTE``
variables, respectively. These take a semicolon delimited string with
should use super-user privileges only for ``make install`` and
not the whole procedure.
-.. _getting access to GROMACS:
+.. _getting access to |Gromacs|:
Getting access to |Gromacs| after installation
----------------------------------------------
The simplest way to run the checks is to build |Gromacs| with
``-DREGRESSIONTEST_DOWNLOAD``, and run ``make check``.
|Gromacs| will automatically download and run the tests for you.
-Alternatively, you can download and unpack the GROMACS
+Alternatively, you can download and unpack the |Gromacs|
regression test suite |gmx-regressiontests-package| tarball yourself
and use the advanced ``cmake`` option ``REGRESSIONTEST_PATH`` to
specify the path to the unpacked tarball, which will then be used for
---------------
:ref:`gro`
- GROMACS format
+ |Gromacs| format
:ref:`g96`
GROMOS-96 format
:ref:`pdb`
Files with the dat file extension contain generic input or output.
As it is not possible
-to categorize all data file formats, GROMACS has a generic file format called
+to categorize all data file formats, |Gromacs| has a generic file format called
dat of which no format is given.
.. _dlg:
eps
---
-The eps file format is not a special GROMACS format, but just a
+The eps file format is not a special |Gromacs| format, but just a
variant of the standard PostScript(tm). A sample eps file as
generated by the :ref:`gmx xpm2ps` program is
included below. It shows the secondary structure of a peptide as a function
A file with the g96 extension can be a GROMOS-96 initial/final
configuration file or a coordinate trajectory file or a combination of both.
The file is fixed format, all floats are written as 15.9 (files can get huge).
-GROMACS supports the following data blocks in the given order:
+|Gromacs| supports the following data blocks in the given order:
* Header block:
See the GROMOS-96 manual for a complete description of the blocks.
-Note that all GROMACS programs can read compressed or g-zipped files.
+Note that all |Gromacs| programs can read compressed or g-zipped files.
.. _gro:
Note that separate molecules or ions (e.g. water or Cl-) are regarded
as residues. If you want to write such a file in your own program
-without using the GROMACS libraries you can use the following formats:
+without using the |Gromacs| libraries you can use the following formats:
C format
``"%5d%-5s%5s%5d%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"``
log
---
-Logfiles are generated by some GROMACS programs and are usually in
+Logfiles are generated by some |Gromacs| programs and are usually in
human-readable format. Use ``more logfile``.
.. _m2p:
ndx
---
-The GROMACS index file (usually called index.ndx) contains some
+The |Gromacs| index file (usually called index.ndx) contains some
user definable sets of atoms. The file can be read by
most analysis programs, by the graphics program
(:ref:`gmx view`)
---
Files with the out file extension contain generic output. As it is not possible
-to categorize all data file formats, GROMACS has a generic file format called
+to categorize all data file formats, |Gromacs| has a generic file format called
out of which no format is given.
.. _pdb:
databank file format describes the positions of atoms in a molecular
structure. Coordinates are read from the ATOM and HETATM records,
until the file ends or an ENDMDL record is encountered.
-GROMACS programs can read and write a simulation box in the
+|Gromacs| programs can read and write a simulation box in the
CRYST1 entry.
The pdb format can also be used as a trajectory format:
several structures, separated by ENDMDL, can be read from
The rtp file extension stands for residue topology.
Such a file is needed by :ref:`gmx pdb2gmx`
-to make a GROMACS topology for a protein contained in a :ref:`pdb`
+to make a |Gromacs| topology for a protein contained in a :ref:`pdb`
file. The file contains the default interaction type for the 4 bonded
interactions and residue entries, which consist of atoms and
optionally bonds, angles dihedrals and impropers.
Files with the trr file extension contain the trajectory of a simulation.
In this file all the coordinates, velocities, forces and energies are
-printed as you told GROMACS in your mdp file. This file is in portable binary
+printed as you told |Gromacs| in your mdp file. This file is in portable binary
format and can be read with :ref:`gmx dump`::
gmx dump -f traj.trr
xpm
---
-The GROMACS xpm file format is compatible with the XPixMap format
+The |Gromacs| xpm file format is compatible with the XPixMap format
and is used for storing matrix data.
-Thus GROMACS xpm files can be viewed directly with programs like XV.
+Thus |Gromacs| xpm files can be viewed directly with programs like XV.
Alternatively, they can be imported into GIMP and scaled to 300 DPI,
using strong antialiasing for font and graphics.
The first matrix data line in an xpm file corresponds to the last matrix
row.
-In addition to the XPixMap format, GROMACS xpm files may contain
+In addition to the XPixMap format, |Gromacs| xpm files may contain
extra fields. The information in these fields is used when converting
an xpm file to EPS with :ref:`gmx xpm2ps`.
The optional extra field are:
* Between the colormap and the matrix data, the fields ``x-axis`` and/or
``y-axis`` may be present followed by the tick-marks for that axis.
-The example GROMACS xpm file below contains all the extra fields.
+The example |Gromacs| xpm file below contains all the extra fields.
The C-comment delimiters and the colon in the extra fields are optional.
::
xvg
---
-Almost all output from GROMACS analysis tools is ready as input for
+Almost all output from |Gromacs| analysis tools is ready as input for
Grace, formerly known as Xmgr. We use Grace, because it is very flexible, and it is also
free software. It produces PostScript(tm) output, which is very suitable
for inclusion in eg. LaTeX documents, but also for other word processors.
-A sample Grace session with GROMACS data is shown below:
+A sample Grace session with |Gromacs| data is shown below:
.. image:: xvgr.gif
:alt: hallo
--- /dev/null
+Floating point arithmetic
+=========================
+
+|Gromacs| spends its life doing arithmetic on real numbers, often summing many
+millions of them. These real numbers are encoded on computers in so-called
+binary floating-point representation. This representation is somewhat like
+scientific exponential notation (but uses binary rather than decimal), and is
+necessary for the fastest possible speed for calculations. Unfortunately the
+laws of algebra only approximately apply to binary floating-point. In part,
+this is because some real numbers that are represented simply and exactly in
+decimal (like 1/5=0.2) have no exact representation in binary floating-point,
+just as 1/3 cannot be represented in decimal. There are many sources you can
+find with a search engine that discuss this issue more exhaustively, such as
+`Wikipedia <https://en.wikipedia.org/wiki/Floating-point_arithmetic>`__ and
+David Goldberg's 1991 paper *What every computer scientist should know about
+floating-point arithmetic* (`article <https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html>`__,
+`addendum <https://docs.oracle.com/cd/E37069_01/html/E39019/z400228248508.html>`__).
+Bruce Dawson also has written a number of very valuable blog posts on modern
+floating-point programming at his
+`Random ASCII site <https://randomascii.wordpress.com/category/floating-point/>`__
+that are worth reading.
+
+So, the sum of a large number of binary representations of exact decimal
+numbers need not equal the expected algebraic or decimal result. Users observe
+this phenomenon in sums of partial charges expressed to two decimal places that
+sometimes only approximate the integer total charge to which they contribute
+(however a deviation in the first decimal place would always be indicative of a
+badly-formed topology). When |Gromacs| has to represent such floating-point
+numbers in output, it sometimes uses a computer form of scientific notation
+known as E notation. In such notation, a number like -9.999971e-01 is actually
+-0.9999971, which is close enough to -1 for purposes of assessing the total
+charge of a system.
+
+It is also not appropriate for |Gromacs| to guess how to round things, because such
+rounding relies on assumptions about the inputs that need not be true. Instead
+the user needs to understand how their tools work.
Flow Chart
==========
-This is a flow chart of a typical GROMACS MD run of a protein
+This is a flow chart of a typical |Gromacs| MD run of a protein
in a box of water.
A more detailed example is available in :doc:`getting-started`.
Several steps of energy minimization may be necessary,
the default case, the binaries are located in
``/usr/local/gromacs/bin``, however, you can ask your local system
administrator for more information, and then follow the advice for
-:ref:`getting access to GROMACS`.
+:ref:`getting access to |Gromacs|`.
Flowchart of typical simulation
-------------------------------
:maxdepth: 2
getting-started
+ floating-point
system-preparation
cutoff-schemes
mdrun-features
the calculations. Since the speed estimate is not deterministic, the
results may vary from run to run.
* Random numbers used for instance as a seed for generating velocities
- (in GROMACS at the preprocessing stage).
+ (in |Gromacs| at the preprocessing stage).
* Uninitialized variables in the code (but there shouldn't be any)
* Dynamic linking to different versions of shared libraries (e.g. for FFTs)
* Dynamic load balancing, since particles are redistributed to
The `Central Limit Theorem <https://en.wikipedia.org/wiki/Central_limit_theorem>`
tells us that in the case of infinitely long
simulations, all observables converge to their equilibrium
-values. Molecular simulations in GROMACS adhere to this theorem, and
+values. Molecular simulations in |Gromacs| adhere to this theorem, and
hence, for instance, the energy of your system will converge to a
finite value, the diffusion constant of your water molecules will
converge to a finite value, and so on. That means all the important
computation is also convenient to do this way.
This feature requires
-:ref:`configuring GROMACS with an external MPI library <mpi-support>`
+:ref:`configuring |Gromacs| with an external MPI library <mpi-support>`
so that the set of
simulations can communicate. The ``n`` simulations within the set can
use internal MPI parallelism also, so that ``mpirun -np x mdrun_mpi``
input files. The random seed for replica exchange is set with
``-reseed``. After every exchange, the velocities are scaled and
neighbor searching is performed. See the Reference Manual for more
-details on how replica exchange functions in GROMACS.
+details on how replica exchange functions in |Gromacs|.
Controlling the length of the simulation
----------------------------------------
This method was initially described as a ProtSqueeze technique
(Yesylevskyy S.O., J Chem Inf Model 47(5) (2007) 1986-94) and
-later implemented in GROMACS as g_membed tool (Wolf et al, J Comp Chem 31 (2010) 2169-2174).
+later implemented in |Gromacs| as g_membed tool (Wolf et al, J Comp Chem 31 (2010) 2169-2174).
Currently the functionality of g_membed is available in mdrun if
``-membed`` option is specified (see below).
Getting good performance from mdrun
===================================
-The GROMACS build system and the :ref:`gmx mdrun` tool has a lot of built-in
+The |Gromacs| build system and the :ref:`gmx mdrun` tool has a lot of built-in
and configurable intelligence to detect your hardware and make pretty
effective use of that hardware. For a lot of casual and serious use of
:ref:`gmx mdrun`, the automatic machinery works well enough. But to get the
up to 8 hardware threads per core.
This feature can usually be enabled or disabled either in
the hardware bios or through a setting in the Linux operating
- system. GROMACS can typically make use of this, for a moderate
+ system. |Gromacs| can typically make use of this, for a moderate
free performance boost. In most cases it will be
enabled by default e.g. on new x86 processors, but in some cases
the system administrators might have disabled it. If that is the
numbers of floating-point instructions in a single cycle.
-GROMACS background information
-------------------------------
+|Gromacs| background information
+--------------------------------
The algorithms in :ref:`gmx mdrun` and their implementations are most relevant
when choosing how to make good use of the hardware. For details,
see the Reference Manual. The most important of these are
Domain Decomposition
The domain decomposition (DD) algorithm decomposes the
(short-ranged) component of the non-bonded interactions into
- domains that share spatial locality, which permits efficient
- code to be written. Each domain handles all of the
+ domains that share spatial locality, which permits the use of
+ efficient algorithms. Each domain handles all of the
particle-particle (PP) interactions for its members, and is
- mapped to a single rank. Within a PP rank, OpenMP threads can
- share the workload, or the work can be off-loaded to a
+ mapped to a single MPI rank. Within a PP rank, OpenMP threads
+ can share the workload, and some work can be off-loaded to a
GPU. The PP rank also handles any bonded interactions for the
members of its domain. A GPU may perform work for more than
one PP rank, but it is normally most efficient to use a single
are efficient to use within a single :term:`node`. The default configuration
using a suitable compiler will deploy a multi-level hybrid parallelism
that uses CUDA, OpenMP and the threading platform native to the
-hardware. For programming convenience, in GROMACS, those native
+hardware. For programming convenience, in |Gromacs|, those native
threads are used to implement on a single node the same MPI scheme as
would be used between nodes, but much more efficient; this is called
thread-MPI. From a user's perspective, real MPI and thread-MPI look
-almost the same, and GROMACS refers to MPI ranks to mean either kind,
+almost the same, and |Gromacs| refers to MPI ranks to mean either kind,
except where noted. A real external MPI can be used for :ref:`gmx mdrun` within
a single node, but runs more slowly than the thread-MPI version.
log file, stdout and stderr are used to print diagnostics that
inform the user about the choices made and possible consequences.
-A number of command-line parameters are available to vary the default
+A number of command-line parameters are available to modify the default
behavior.
``-nt``
The total number of threads to use. The default, 0, will start as
many threads as available cores. Whether the threads are
- thread-MPI ranks, or OpenMP threads within such ranks depends on
+ thread-MPI ranks, and/or OpenMP threads within such ranks depends on
other settings.
``-ntmpi``
The total number of ranks to dedicate to the long-ranged
component of PME, if used. The default, -1, will dedicate ranks
only if the total number of threads is at least 12, and will use
- around one-third of the ranks for the long-ranged component.
+ around a quarter of the ranks for the long-ranged component.
``-ntomp_pme``
When using PME with separate PME ranks,
are no separate PME ranks.
``-nb``
- Can be set to "auto", "cpu", "gpu", "cpu_gpu."
+ Used to set where to execute the non-bonded interactions.
+ Can be set to "auto", "cpu", "gpu", "gpu_cpu."
Defaults to "auto," which uses a compatible GPU if available.
Setting "cpu" requires that no GPU is used. Setting "gpu" requires
that a compatible GPU be available and will be used. Setting
- "cpu_gpu" permits the CPU to execute a GPU-like code path, which
- will run slowly on the CPU and should only be used for debugging.
+ "gpu_cpu" lets the GPU compute the local and the CPU the non-local
+ non-bonded interactions. Is only faster under a narrow range of
+ conditions.
Examples for mdrun on one node
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Running mdrun on more than one node
-----------------------------------
-This requires configuring GROMACS to build with an external MPI
+This requires configuring |Gromacs| to build with an external MPI
library. By default, this mdrun executable is run with
:ref:`mdrun_mpi`. All of the considerations for running single-node
mdrun still apply, except that ``-ntmpi`` and ``-nt`` cause a fatal
multiple of 32
- Some Ewald tabulated kernels are known to produce incorrect results, so
(correct) analytical kernels are used instead.
+
+Performance checklist
+---------------------
+
+There are many different aspects that affect the performance of simulations in
+|Gromacs|. Most simulations require a lot of computational resources, therefore
+it can be worthwhile to optimize the use of those resources. Several issues
+mentioned in the list below could lead to a performance difference of a factor
+of 2. So it can be useful to go through the checklist.
+
+|Gromacs| configuration
+^^^^^^^^^^^^^^^^^^^^^^^
+
+* Don't use double precision unless you're absolutely sure you need it.
+* Compile the FFTW library (yourself) with the correct flags on x86 (in most
+ cases, the correct flags are automatically configured).
+* On x86, use gcc or icc as the compiler (not pgi or the Cray compiler).
+* On POWER, use gcc instead of IBM's xlc.
+* Use a new compiler version, especially for gcc (e.g. from version 5 to 6
+ the performance of the compiled code improved a lot).
+* MPI library: OpenMPI usually has good performance and causes little trouble.
+* Make sure your compiler supports OpenMP (some versions of Clang don't).
+* If you have GPUs that support either CUDA or OpenCL, use them.
+
+ * Configure with ``-DGMX_GPU=ON`` (add ``-DGMX_USE_OPENCL=ON`` for OpenCL).
+  * For CUDA, use the newest CUDA available for your GPU to take advantage of the
+ latest performance enhancements.
+ * Use a recent GPU driver.
+  * If compiling on a cluster head node, make sure that ``GMX_SIMD``
+ is appropriate for the compute nodes.
+
+Run setup
+^^^^^^^^^
+
+* For an approximately spherical solute, use a rhombic dodecahedron unit cell.
+* When using a time-step of 2 fs, use :mdp:`constraints` = :mdp:`h-bonds`
+ (and not :mdp:`all-bonds`), since this is faster, especially with GPUs,
+ and most force fields have been parametrized with only bonds involving
+ hydrogens constrained.
+* You can increase the time-step to 4 or 5 fs when using virtual interaction
+ sites (``gmx pdb2gmx -vsite h``).
+* For massively parallel runs with PME, you might need to try different numbers
+ of PME ranks (``gmx mdrun -npme ???``) to achieve best performance;
+ ``gmx tune_pme`` can help automate this search.
+* For massively parallel runs (also ``gmx mdrun -multidir``), or with a slow
+ network, global communication can become a bottleneck and you can reduce it
+ with ``gmx mdrun -gcom`` (note that this does affect the frequency of
+ temperature and pressure coupling).
+
+Checking and improving performance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Look at the end of the ``md.log`` file to see the performance and the cycle
+ counters and wall-clock time for different parts of the MD calculation. The
+ PP/PME load ratio is also printed, with a warning when a lot of performance is
+ lost due to imbalance.
+* Adjust the number of PME ranks and/or the cut-off and PME grid-spacing when
+ there is a large PP/PME imbalance. Note that even with a small reported
+ imbalance, the automated PME-tuning might have reduced the initial imbalance.
+ You could still gain performance by changing the mdp parameters or increasing
+ the number of PME ranks.
+* If the neighbor searching takes a lot of time, increase nstlist (with the
+ Verlet cut-off scheme, this automatically adjusts the size of the neighbour
+ list to do more non-bonded computation to keep energy drift constant).
+
+ * If ``Comm. energies`` takes a lot of time (a note will be printed in the log
+ file), increase nstcalcenergy or use ``mdrun -gcom``.
+ * If all communication takes a lot of time, you might be running on too many
+ cores, or you could try running combined MPI/OpenMP parallelization with 2
+ or 4 OpenMP threads per MPI process.
#define DD_NLOAD_MAX 9
-const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
+const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_CGIBS 2
void read_eigenvalues(int vecs[], const char *eigfile, real values[],
- gmx_bool bHesse, real kT)
+ gmx_bool bHesse, real kT, int natoms_average_struct)
{
int neig, nrow, i;
double **eigval;
{
for (i = 0; vecs[i]; i++)
{
- if (vecs[i] > (neig-6))
+ /* Make sure this eigenvalue does not correspond to one of the last 6 eigenvectors of the
+ * covariance matrix. These correspond to the rotational and translational degrees of
+ * freedom and will be zero within numerical accuracy.
+ *
+ * Note that the total number of eigenvectors produced by gmx covar depends on:
+ * 1) the total number of degrees of freedom of the system (3N, with N the number of atoms)
+ * 2) the number S of independent configurations fed into gmx covar.
+ * For long trajectories with lots of frames, usually S >= 3N + 1, so that one indeed gets
+ * 3N eigenvalues (of which the last 6 will have zero eigenvalues).
+ * For S < 3N + 1, however, the covariance matrix becomes rank deficient, and the number
+ * of possible eigenvalues is just S - 1. Since in make_edi we only know N but not S, we can
+ * only warn the user if he picked one of the last 6 of 3N.
+ */
+ if (vecs[i] > ( 3 * natoms_average_struct - 6 ))
{
gmx_fatal(FARGS, "ERROR: You have chosen one of the last 6 eigenvectors of the COVARIANCE Matrix. That does not make sense, since they correspond to the 6 rotational and translational degrees of freedom.\n\n");
}
if (listen[evFLOOD][0] != 0)
{
- read_eigenvalues(listen[evFLOOD], opt2fn("-eig", NFILE, fnm), evStepList[evFLOOD], bHesse, kB*T);
+ read_eigenvalues(listen[evFLOOD], opt2fn("-eig", NFILE, fnm), evStepList[evFLOOD], bHesse, kB*T, nav);
}
edi_params.flood.tau = tau;
evdw_names[ir->vdwtype],
eintmod_names[eintmodPOTSHIFT],
eintmod_names[eintmodNONE]);
+ warning_error(wi, err_buf);
}
}
{
return false;
}
- HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
+ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
- HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
+ HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
return true;
}
app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
{
nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
- HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
+ HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
}
}
nvml_stat = nvmlShutdown();
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
for (float f = 1.0; f < 10.0; f += 1.0)
{
- result.push_back(gmx::sixthroot(f));
+ result.push_back(gmx::invsixthroot(f));
}
checker.checkSequence(result.begin(), result.end(), "InvsixthrootFloat");
}
for (double d = 1.0; d < 10.0; d += 1.0)
{
- result.push_back(gmx::sixthroot(d));
+ result.push_back(gmx::invsixthroot(d));
}
checker.checkSequence(result.begin(), result.end(), "InvsixthrootDouble");
}
for (int i = 1; i < 10; i++)
{
- result.push_back(gmx::sixthroot(i));
+ result.push_back(gmx::invsixthroot(i));
}
checker.checkSequence(result.begin(), result.end(), "InvsixthrootInteger");
}
<Sequence Name="InvsixthrootDouble">
<Int Name="Length">9</Int>
<Real>1</Real>
- <Real>1.122462048309373</Real>
- <Real>1.2009369551760027</Real>
- <Real>1.2599210498948732</Real>
- <Real>1.3076604860118306</Real>
- <Real>1.3480061545972777</Real>
- <Real>1.3830875542684886</Real>
- <Real>1.4142135623730951</Real>
- <Real>1.4422495703074083</Real>
+ <Real>0.89089871814033927</Real>
+ <Real>0.83268317765560429</Real>
+ <Real>0.79370052598409968</Real>
+ <Real>0.76472449133173004</Real>
+ <Real>0.74183637559040227</Real>
+ <Real>0.72302002639948371</Real>
+ <Real>0.70710678118654746</Real>
+ <Real>0.69336127435063477</Real>
</Sequence>
</ReferenceData>
<Sequence Name="InvsixthrootFloat">
<Int Name="Length">9</Int>
<Real>1</Real>
- <Real>1.122462</Real>
- <Real>1.2009369</Real>
- <Real>1.2599211</Real>
- <Real>1.3076605</Real>
- <Real>1.3480061</Real>
- <Real>1.3830875</Real>
- <Real>1.4142135</Real>
- <Real>1.4422495</Real>
+ <Real>0.8908987</Real>
+ <Real>0.83268321</Real>
+ <Real>0.79370052</Real>
+ <Real>0.76472449</Real>
+ <Real>0.74183637</Real>
+ <Real>0.72302002</Real>
+ <Real>0.70710677</Real>
+ <Real>0.69336128</Real>
</Sequence>
</ReferenceData>
<Sequence Name="InvsixthrootInteger">
<Int Name="Length">9</Int>
<Real>1</Real>
- <Real>1.122462048309373</Real>
- <Real>1.2009369551760027</Real>
- <Real>1.2599210498948732</Real>
- <Real>1.3076604860118306</Real>
- <Real>1.3480061545972777</Real>
- <Real>1.3830875542684886</Real>
- <Real>1.4142135623730951</Real>
- <Real>1.4422495703074083</Real>
+ <Real>0.89089871814033927</Real>
+ <Real>0.83268317765560429</Real>
+ <Real>0.79370052598409968</Real>
+ <Real>0.76472449133173004</Real>
+ <Real>0.74183637559040227</Real>
+ <Real>0.72302002639948371</Real>
+ <Real>0.70710678118654746</Real>
+ <Real>0.69336127435063477</Real>
</Sequence>
</ReferenceData>
continue;
}
- pcrd->f_scal = dr_tot[c]/((pull->group[pcrd->params.group[0]].invtm + pull->group[pcrd->params.group[1]].invtm)*dt*dt);
+ /* Accumulate the forces, in case we have multiple constraint steps */
+ pcrd->f_scal += dr_tot[c]/((pull->group[pcrd->params.group[0]].invtm + pull->group[pcrd->params.group[1]].invtm)*dt*dt);
if (vir != nullptr && pcrd->params.eGeom != epullgDIRPBC && bMaster)
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
dotProduct(Simd4Double a, Simd4Double b)
{
vector4double dp_sh0 = vec_mul(a.simdInternal_, b.simdInternal_);
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
dotProduct(Simd4Double a, Simd4Double b)
{
__m128d tmp1, tmp2;
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
reduce(Simd4Double a)
{
__m128d a0, a1;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
dotProduct(Simd4Double a, Simd4Double b)
{
__m128d tmp1, tmp2;
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
reduce(Simd4Double a)
{
__m128d a0, a1;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
dotProduct(Simd4Double a, Simd4Double b)
{
return _mm512_mask_reduce_add_pd(_mm512_int2mask(7),
};
}
-static inline float gmx_simdcall
+static inline double gmx_simdcall
reduce(Simd4Double a)
{
return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
}
}
- if (ir->eI != eiVV)
- {
- enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
- and there is no previous step */
- }
/* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
temperature control */
if (MASTER(cr))
{
- if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
- {
- fprintf(fplog,
- "RMS relative constraint deviation after constraining: %.2e\n",
- constr_rmsd(constr));
- }
- if (EI_STATE_VELOCITY(ir->eI))
+ if (!ir->bContinuation)
{
- fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+ if (constr && ir->eConstrAlg == econtLINCS)
+ {
+ fprintf(fplog,
+ "RMS relative constraint deviation after constraining: %.2e\n",
+ constr_rmsd(constr));
+ }
+ if (EI_STATE_VELOCITY(ir->eI))
+ {
+ real temp = enerd->term[F_TEMP];
+ if (ir->eI != eiVV)
+ {
+ /* Result of Ekin averaged over velocities of -half
+ * and +half step, while we only have -half step here.
+ */
+ temp *= 2;
+ }
+ fprintf(fplog, "Initial temperature: %g K\n", temp);
+ }
}
+
if (bRerunMD)
{
fprintf(stderr, "starting md rerun '%s', reading coordinates from"