Merge branch release-2016

author Mark Abraham <mark.j.abraham@gmail.com>

Mon, 31 Oct 2016 11:39:40 +0000 (12:39 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 31 Oct 2016 13:38:43 +0000 (14:38 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 31 Oct 2016 11:39:40 +0000 (12:39 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 31 Oct 2016 13:38:43 +0000 (14:38 +0100)
diff --combined cmake/gmxVersionInfo.cmake

index f7d7a2bc9efbefdca7b65c78e3bc2480f82557db,e8fe834201f3a7ea0aaf82139236dab0392edcf8..6a4b9619ef1bb045711dec147e9fb2822f3b9a4f
--- 1/cmake/gmxVersionInfo.cmake
--- 2/cmake/gmxVersionInfo.cmake
+++ b/cmake/gmxVersionInfo.cmake
@@@ -187,8 -187,8 +187,8 @@@
   
   # The GROMACS convention is that these are the version number of the next
   # release that is going to be made from this branch.
- -set(GMX_VERSION_MAJOR 2016)
- -set(GMX_VERSION_PATCH 2)
+ +set(GMX_VERSION_MAJOR 2017)
+ +set(GMX_VERSION_PATCH 0)
   # The suffix, on the other hand, is used mainly for betas and release
   # candidates, where it signifies the most recent such release from
   # this branch; it will be empty before the first such release, as well
@@@ -204,8 -204,8 +204,8 @@@ set(GMX_VERSION_SUFFIX ""
   # here. The important thing is to minimize the chance of third-party
   # code being able to dynamically link with a version of libgromacs
   # that might not work.
- -set(LIBRARY_SOVERSION_MAJOR 2)
- -set(LIBRARY_SOVERSION_MINOR 1)
+ +set(LIBRARY_SOVERSION_MAJOR 3)
+ +set(LIBRARY_SOVERSION_MINOR 0)
   set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
   
   #####################################################################
@@@ -229,7 -229,7 +229,7 @@@ set(REGRESSIONTEST_BRANCH "refs/heads/r
   # each release. It's hard to test because it is only used for
   # REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
   # been placed on the server.
- set(REGRESSIONTEST_MD5SUM "5f49cfc4f04a34f117340cf9b3e5f8a2" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+ set(REGRESSIONTEST_MD5SUM "366438549270d005fa6def6e56ca0256" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
   
   math(EXPR GMX_VERSION_NUMERIC
        "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
diff --combined docs/install-guide/index.rst

index ffa7047de32472cfb70c0ea714e86c0cb9de51ea,6add227204190e8c0dffa8a43e03f5b92d892e34..9fc01b432e03b38570d5663cf36777408a9f0f88
--- 1/docs/install-guide/index.rst
--- 2/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@@ -15,7 -15,7 +15,7 @@@ These instructions pertain to building 
   Quick and dirty installation
   ----------------------------
   1. Get the latest version of your C and C++ compilers.
- -2. Check that you have CMake version |GMX_CMAKE_MINIMUM_REQUIRED_VERSION| or later.
+ +2. Check that you have CMake version |CMAKE_MINIMUM_REQUIRED_VERSION| or later.
   3. Get and unpack the latest version of the |Gromacs| tarball.
   4. Make a separate build directory and change to it. 
   5. Run ``cmake`` with the path to the source as an argument
@@@ -46,6 -46,15 +46,15 @@@ fast. If you want to get the maximum va
   hardware, libraries, and compilers are only going to continue to get
   more complex.
   
+ Quick and dirty cluster installation
+ ------------------------------------
+ 
+ On a cluster where users are expected to be running across multiple
+ nodes using MPI, make one installation similar to the above, and
+ another using an MPI wrapper compiler and which is `building only
+ mdrun`_, because that is the only component of |Gromacs| that uses
+ MPI.
+ 
   Typical installation
   --------------------
   As above, and with further details below, but you should consider
@@@ -54,7 -63,7 +63,7 @@@ appropriate value instead of ``xxx`` 
   
   * ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
   * ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
- * ``-DGMX_MPI=on`` to build using `MPI support`_
+ * ``-DGMX_MPI=on`` to build using `MPI support`_ (generally good to combine with `building only mdrun`_)
   * ``-DGMX_GPU=on`` to build using nvcc to run using NVIDIA `CUDA GPU acceleration`_ or an OpenCL_ GPU
   * ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
   * ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which |Gromacs| will run
@@@ -91,11 -100,10 +100,11 @@@ compiler. We recommend gcc, because it 
   frequently provides the best performance.
   
   You should strive to use the most recent version of your
- -compiler. Minimum supported compiler versions are
- -* GNU (gcc) 4.6
- -* Intel (icc) 14
- -* LLVM (clang) 3.4
+ +compiler. Since we require full C++11 support the minimum supported
+ +compiler versions are
+ +* GNU (gcc) 4.8.1
+ +* Intel (icc) 15.0
+ +* LLVM (clang) 3.3
   * Microsoft (MSVC) 2015
   Other compilers may work (Cray, Pathscale, older clang) but do
   not offer competitive performance. We recommend against PGI because
@@@ -112,7 -120,7 +121,7 @@@ other compilers, read on
   
   On Linux, both the Intel and clang compiler use the libstdc++ which
   comes with gcc as the default C++ library. For |Gromacs|, we require
- -the compiler to support libstc++ version 4.6.1 or higher. To select a
+ +the compiler to support libstc++ version 4.8.1 or higher. To select a
   particular libstdc++ library, use:
   
   * For Intel: ``-DGMX_STDLIB_CXX_FLAGS=-gcc-name=/path/to/gcc/binary``
@@@ -141,11 -149,6 +150,11 @@@ For all non-x86 platforms, your best op
   the vendor's default or recommended compiler, and check for
   specialized information below.
   
+ +For updated versions of gcc to add to your Linux OS, see
+ +
+ +* Ubuntu: `Ubuntu toolchain ppa page`_
+ +* RHEL/CentOS: `EPEL page`_ or the RedHat Developer Toolset
+ +
   Compiling with parallelization options
   --------------------------------------
   
@@@ -157,10 -160,8 +166,10 @@@ generally built into your compiler and 
   GPU support
   ^^^^^^^^^^^
   |Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
- -NVIDIA's CUDA_ version |REQUIRED_CUDA_VERSION| software development kit is required,
- -and the latest version is strongly encouraged. NVIDIA GPUs with at
+ +On Linux with gcc, NVIDIA's CUDA_ version |REQUIRED_CUDA_VERSION|
+ +software development kit is required, and the latest
+ +version is strongly encouraged. Using Intel or Microsoft compilers
+ +requires version 7.0 and 8.0, respectively. NVIDIA GPUs with at
   least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
   required, e.g. Fermi, Kepler, Maxwell or Pascal cards. You are strongly recommended to
   get the latest CUDA version and driver supported by your hardware, but
@@@ -217,7 -218,7 +226,7 @@@ CMak
   -----
   
   |Gromacs| builds with the CMake build system, requiring at least
- -version |GMX_CMAKE_MINIMUM_REQUIRED_VERSION|. You can check whether
+ +version |CMAKE_MINIMUM_REQUIRED_VERSION|. You can check whether
   CMake is installed, and what version it is, with ``cmake
   --version``. If you need to install CMake, then first check whether
   your platform's package management system provides a suitable version,
@@@ -259,23 -260,20 +268,20 @@@ recommends eithe
   * that you build FFTW from the source code.
   
   If you build FFTW from source yourself, get the most recent version
- and follow the `FFTW installation guide`_. Note that we have recently
- contributed new SIMD optimization for several extra platforms to
- FFTW, which will appear in FFTW-3.3.5 (for now it is available in the
- FFTW repository on github, or you can find a very unofficial prerelease
- version at ftp://ftp.gromacs.org/pub/contrib ).
- Choose the precision for FFTW (i.e. single/float vs. double) to
- match whether you will later use mixed or double precision for
- |Gromacs|. There is no need to compile FFTW with
- threading or MPI support, but it does no harm. On x86 hardware,
- compile with *both* ``--enable-sse2`` and ``--enable-avx`` for
- FFTW-3.3.4 and earlier. As of FFTW-3.3.5 you should also add
- ``--enable-avx2``. FFTW will create a fat library with codelets
- for all different instruction sets, and pick the fastest supported
- one at runtime. On IBM Power8, you definitely want the upcoming
- FFTW-3.3.5 and to compile it with ``--enable-vsx`` for SIMD support. If you are
- using a Cray, there is a special modified (commercial) version of
- FFTs using the FFTW interface which can be slightly faster.
+ and follow the `FFTW installation guide`_. Choose the precision for
+ FFTW (i.e. single/float vs. double) to match whether you will later
+ use mixed or double precision for |Gromacs|. There is no need to
+ compile FFTW with threading or MPI support, but it does no harm. On
+ x86 hardware, compile with *both* ``--enable-sse2`` and
+ ``--enable-avx`` for FFTW-3.3.4 and earlier. From FFTW-3.3.5, you
+ should also add ``--enable-avx2`` also. On Intel chipsets supporting
+ 512-wide AVX, including KNL, add ``--enable-avx512`` also. FFTW will
+ create a fat library with codelets for all different instruction sets,
+ and pick the fastest supported one at runtime. On IBM Power8, you
+ definitely want FFTW-3.3.5 and to compile it with ``--enable-vsx`` for
+ SIMD support. If you are using a Cray, there is a special modified
+ (commercial) version of FFTs using the FFTW interface which can be
+ slightly faster.
   
   Using MKL
   ^^^^^^^^^
@@@ -605,22 -603,35 +611,35 @@@ CPUs also works well
   
   OpenCL GPU acceleration
   ^^^^^^^^^^^^^^^^^^^^^^^
- To build Gromacs with OpenCL support enabled, an OpenCL_ SDK
- (e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
- in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
- variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
- flags must be set
+ 
+ The primary target of the |Gromacs| OpenCL support is accelerating simulations
+ on AMD hardware, both discrete GPUs and APUs (integrated CPU+GPU chips).
+ The |Gromacs| OpenCL on NVIDIA GPUs works, but performance
+ and other limitations make it less practical (for details see the user guide).
+ 
+ To build |Gromacs| with OpenCL_ support enabled, two components are
+ required: the OpenCL_ headers and the wrapper library that acts
+ as a client driver loader (so-called ICD loader).
+ The additional, runtime-only dependency is the vendor-specific GPU driver
+ for the device targeted. This also contains the OpenCL_ compiler.
+ As the GPU compute kernels are compiled  on-demand at run time,
+ this vendor-specific compiler and driver is not needed for building |Gromacs|.
+ The former, compile-time dependencies are standard components,
+ hence stock versions can be obtained from most Linux distribution
+ repositories (e.g. ``opencl-headers`` and ``ocl-icd-libopencl1`` on Debian/Ubuntu).
+ Only the compatibility with the required OpenCL_ version |REQUIRED_OPENCL_MIN_VERSION|
+ needs to be ensured.
+ Alternatively, the headers and library can also be obtained from vendor SDKs
+ (e.g. `from AMD <http://developer.amd.com/appsdk>`_),
+ which must be installed in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+ variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``).
+ 
+ To trigger an OpenCL_ build the following CMake flags must be set
   
   ::
   
       cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
   
- Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
- known limitations in the user guide. If you want to
- do so anyway, because NVIDIA OpenCL support is part of the CUDA
- package, a C++ compiler supported by your CUDA installation is
- required.
- 
   On Mac OS, an AMD GPU can be used only with OS version 10.10.4 and
   higher; earlier OS versions are known to run incorrectly.
   
@@@ -797,24 -808,14 +816,14 @@@ supported by ``cmake`` (e.g. ``ninja``
   
   Building only mdrun
   ^^^^^^^^^^^^^^^^^^^
- Past versions of the build system offered "mdrun" and "install-mdrun"
- targets (similarly for other programs too) to build and install only
- the mdrun program, respectively. Such a build is useful when the
- configuration is only relevant for mdrun (such as with
- parallelization options for MPI, SIMD, GPUs, or on BlueGene or Cray),
- or the length of time for the compile-link-install cycle is relevant
- when developing.
   
   This is now supported with the ``cmake`` option
- ``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a cut-down version of
- ``libgromacs`` and/or the mdrun program.
+ ``-DGMX_BUILD_MDRUN_ONLY=ON``, which will build a different version of
+ ``libgromacs`` and the ``mdrun`` program.
   Naturally, now ``make install`` installs only those
   products. By default, mdrun-only builds will default to static linking
   against |Gromacs| libraries, because this is generally a good idea for
- the targets for which an mdrun-only build is desirable. If you re-use
- a build tree and change to the mdrun-only build, then you will inherit
- the setting for ``BUILD_SHARED_LIBS`` from the old build, and will be
- warned that you may wish to manage ``BUILD_SHARED_LIBS`` yourself.
+ the targets for which an mdrun-only build is desirable.
   
   Installing |Gromacs|
   --------------------
@@@ -926,8 -927,9 +935,9 @@@ be run. You can use ``./gmxtest.pl -mpi
   run an MPI program is called ``srun``.
   
   The ``make check`` target also runs integration-style tests that may run
- with MPI if ``GMX_MPI=ON`` was set. To make these work, you may need to
- set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``, ``NUMPROC``,
+ with MPI if ``GMX_MPI=ON`` was set. To make these work with various possible
+ MPI libraries, you may need to
+ set the CMake variables ``MPIEXEC``, ``MPIEXEC_NUMPROC_FLAG``,
   ``MPIEXEC_PREFLAGS`` and ``MPIEXEC_POSTFLAGS`` so that
   ``mdrun-mpi-test_mpi`` would run on multiple ranks via the shell command
   
@@@ -936,8 -938,11 +946,11 @@@
       ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${NUMPROC} ${MPIEXEC_PREFLAGS} \
             mdrun-mpi-test_mpi ${MPIEXEC_POSTFLAGS} -otherflags
   
- Typically, one might use variable values ``mpirun``, ``-np``, ``2``, ``''``,
- ``''`` respectively, in order to run on two ranks.
+ A typical example for SLURM is
+ 
+ ::
+ 
+      cmake .. -DGMX_MPI=on -DMPIEXEC=srun -DMPIEXEC_NUMPROC_FLAG=-n -DMPIEXEC_PREFLAGS= -DMPIEXEC_POSTFLAGS=
   
   
   Testing |Gromacs| for performance
@@@ -1134,9 -1139,9 +1147,9 @@@ much everywhere, it is important that w
   it works because we have tested it. We do test on Linux, Windows, and
   Mac with a range of compilers and libraries for a range of our
   configuration options. Every commit in our git source code repository
- -is currently tested on x86 with gcc versions ranging from 4.6 through
- -5.2, and versions 16 of the Intel compiler as well as Clang
- -version 3.4 through 3.8. For this, we use a variety of GNU/Linux
+ +is currently tested on x86 with a number of gcc versions ranging from 4.8.1
+ +through 6.1, versions 16 of the Intel compiler, and Clang
+ +versions 3.4 through 3.8. For this, we use a variety of GNU/Linux
   flavors and versions as well as recent versions of Windows. Under
   Windows, we test both MSVC 2015 and version 16 of the Intel compiler.
   For details, you can
diff --combined src/gromacs/domdec/domdec.cpp

index e1c020808d36743150df0413cc71fd518d302cbf,ac17ffb7dd0081b98ee7d34fc2bc7656ae063f56..93a685edce4cbf4c90620d432dc88786363d23d7
--- 1/src/gromacs/domdec/domdec.cpp
--- 2/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@@ -70,7 -70,6 +70,7 @@@
   #include "gromacs/mdlib/gmx_omp_nthreads.h"
   #include "gromacs/mdlib/mdatoms.h"
   #include "gromacs/mdlib/mdrun.h"
+ +#include "gromacs/mdlib/mdsetup.h"
   #include "gromacs/mdlib/nb_verlet.h"
   #include "gromacs/mdlib/nbnxn_grid.h"
   #include "gromacs/mdlib/nsgrid.h"
@@@ -92,7 -91,6 +92,7 @@@
   #include "gromacs/topology/block.h"
   #include "gromacs/topology/idef.h"
   #include "gromacs/topology/ifunc.h"
+ +#include "gromacs/topology/mtop_lookup.h"
   #include "gromacs/topology/mtop_util.h"
   #include "gromacs/topology/topology.h"
   #include "gromacs/utility/basedefinitions.h"
@@@ -179,6 -177,23 +179,23 @@@ static const in
   #define GMX_DD_NNODES_SENDRECV 4
   
   
+ /* We check if to turn on DLB at the first and every 100 DD partitionings.
+  * With large imbalance DLB will turn on at the first step, so we can
+  * make the interval so large that the MPI overhead of the check is negligible.
+  */
+ static const int c_checkTurnDlbOnInterval  = 100;
+ /* We need to check if DLB results in worse performance and then turn it off.
+  * We check this more often then for turning DLB on, because the DLB can scale
+  * the domains very rapidly, so if unlucky the load imbalance can go up quickly
+  * and furthermore, we are already synchronizing often with DLB, so
+  * the overhead of the MPI Bcast is not that high.
+  */
+ static const int c_checkTurnDlbOffInterval =  20;
+ 
+ /* Forward declaration */
+ static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
+ 
+ 
   /*
      #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
   
@@@ -258,7 -273,8 +275,8 @@@ t_block *dd_charge_groups_global(gmx_do
   
   static bool dlbIsOn(const gmx_domdec_comm_t *comm)
   {
-     return (comm->dlbState == edlbsOn);
+     return (comm->dlbState == edlbsOnCanTurnOff ||
+             comm->dlbState == edlbsOnForever);
   }
   
   static void vec_rvec_init(vec_rvec_t *v)
@@@ -285,8 -301,13 +303,8 @@@ void dd_store_state(gmx_domdec_t *dd, t
           gmx_incons("The state does not the domain decomposition state");
       }
   
- -    state->ncg_gl = dd->ncg_home;
- -    if (state->ncg_gl > state->cg_gl_nalloc)
- -    {
- -        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
- -        srenew(state->cg_gl, state->cg_gl_nalloc);
- -    }
- -    for (i = 0; i < state->ncg_gl; i++)
+ +    state->cg_gl.resize(dd->ncg_home);
+ +    for (i = 0; i < dd->ncg_home; i++)
       {
           state->cg_gl[i] = dd->index_gl[i];
       }
@@@ -343,15 -364,6 +361,15 @@@ void dd_get_ns_ranges(const gmx_domdec_
       }
   }
   
+ +int dd_natoms_mdatoms(const gmx_domdec_t *dd)
+ +{
+ +    /* We currently set mdatoms entries for all atoms:
+ +     * local + non-local + communicated for vsite + constraints
+ +     */
+ +
+ +    return dd->comm->nat[ddnatNR - 1];
+ +}
+ +
   int dd_natoms_vsite(const gmx_domdec_t *dd)
   {
       return dd->comm->nat[ddnatVSITE];
@@@ -1063,8 -1075,8 +1081,8 @@@ static void dd_collect_cg(gmx_domdec_t 
   
           cgs_gl = &dd->comm->cgs_gl;
   
- -        ncg_home = state_local->ncg_gl;
- -        cg       = state_local->cg_gl;
+ +        ncg_home = state_local->cg_gl.size();
+ +        cg       = state_local->cg_gl.data();
           nat_home = 0;
           for (i = 0; i < ncg_home; i++)
           {
@@@ -1128,7 -1140,7 +1146,7 @@@
   }
   
   static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
- -                                    rvec *lv, rvec *v)
+ +                                    const rvec *lv, rvec *v)
   {
       gmx_domdec_master_t *ma;
       int                  n, i, c, a, nalloc = 0;
@@@ -1140,8 -1152,8 +1158,8 @@@
       if (!DDMASTER(dd))
       {
   #if GMX_MPI
- -        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
- -                 dd->rank, dd->mpi_comm_all);
+ +        MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
+ +                 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
   #endif
       }
       else
@@@ -1205,7 -1217,7 +1223,7 @@@ static void get_commbuffer_counts(gmx_d
   }
   
   static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
- -                                   rvec *lv, rvec *v)
+ +                                   const rvec *lv, rvec *v)
   {
       gmx_domdec_master_t *ma;
       int                 *rcounts = NULL, *disps = NULL;
@@@ -1242,15 -1254,11 +1260,15 @@@
       }
   }
   
- -void dd_collect_vec(gmx_domdec_t *dd,
- -                    t_state *state_local, rvec *lv, rvec *v)
+ +void dd_collect_vec(gmx_domdec_t           *dd,
+ +                    t_state                *state_local,
+ +                    const PaddedRVecVector *localVector,
+ +                    rvec                   *v)
   {
       dd_collect_cg(dd, state_local);
   
+ +    const rvec *lv = as_rvec_array(localVector->data());
+ +
       if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
       {
           dd_collect_vec_sendrecv(dd, lv, v);
@@@ -1261,14 -1269,6 +1279,14 @@@
       }
   }
   
+ +void dd_collect_vec(gmx_domdec_t           *dd,
+ +                    t_state                *state_local,
+ +                    const PaddedRVecVector *localVector,
+ +                    PaddedRVecVector       *vector)
+ +{
+ +    dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
+ +}
+ +
   
   void dd_collect_state(gmx_domdec_t *dd,
                         t_state *state_local, t_state *state)
@@@ -1317,15 -1317,15 +1335,15 @@@
               switch (est)
               {
                   case estX:
- -                    dd_collect_vec(dd, state_local, state_local->x, state->x);
+ +                    dd_collect_vec(dd, state_local, &state_local->x, &state->x);
                       break;
                   case estV:
- -                    dd_collect_vec(dd, state_local, state_local->v, state->v);
+ +                    dd_collect_vec(dd, state_local, &state_local->v, &state->v);
                       break;
                   case est_SDX_NOTSUPPORTED:
                       break;
                   case estCGP:
- -                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
+ +                    dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
                       break;
                   case estDISRE_INITF:
                   case estDISRE_RM3TAV:
@@@ -1339,15 -1339,17 +1357,15 @@@
       }
   }
   
- -static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
+ +static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
   {
       int est;
   
       if (debug)
       {
- -        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
+ +        fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
       }
   
- -    state->nalloc = over_alloc_dd(nalloc);
- -
       for (est = 0; est < estNR; est++)
       {
           if (EST_DISTR(est) && (state->flags & (1<<est)))
@@@ -1358,15 -1360,15 +1376,15 @@@
               switch (est)
               {
                   case estX:
- -                    srenew(state->x, state->nalloc + 1);
+ +                    state->x.resize(natoms + 1);
                       break;
                   case estV:
- -                    srenew(state->v, state->nalloc + 1);
+ +                    state->v.resize(natoms + 1);
                       break;
                   case est_SDX_NOTSUPPORTED:
                       break;
                   case estCGP:
- -                    srenew(state->cg_p, state->nalloc + 1);
+ +                    state->cg_p.resize(natoms + 1);
                       break;
                   case estDISRE_INITF:
                   case estDISRE_RM3TAV:
@@@ -1375,41 -1377,39 +1393,41 @@@
                       /* No reallocation required */
                       break;
                   default:
- -                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
+ +                    gmx_incons("Unknown state entry encountered in dd_resize_state");
               }
           }
       }
   
       if (f != NULL)
       {
- -        srenew(*f, state->nalloc);
+ +        (*f).resize(natoms + 1);
       }
   }
   
- -static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
- -                               int nalloc)
+ +static void dd_check_alloc_ncg(t_forcerec       *fr,
+ +                               t_state          *state,
+ +                               PaddedRVecVector *f,
+ +                               int               numChargeGroups)
   {
- -    if (nalloc > fr->cg_nalloc)
+ +    if (numChargeGroups > fr->cg_nalloc)
       {
           if (debug)
           {
- -            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
+ +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
           }
- -        fr->cg_nalloc = over_alloc_dd(nalloc);
+ +        fr->cg_nalloc = over_alloc_dd(numChargeGroups);
           srenew(fr->cginfo, fr->cg_nalloc);
           if (fr->cutoff_scheme == ecutsGROUP)
           {
               srenew(fr->cg_cm, fr->cg_nalloc);
           }
       }
- -    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
+ +    if (fr->cutoff_scheme == ecutsVERLET)
       {
           /* We don't use charge groups, we use x in state to set up
            * the atom communication.
            */
- -        dd_realloc_state(state, f, nalloc);
+ +        dd_resize_state(state, f, numChargeGroups);
       }
   }
   
@@@ -1519,11 -1519,7 +1537,11 @@@ static void dd_distribute_vec(gmx_domde
   
   static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
   {
- -    int i;
+ +    if (dfhist == NULL)
+ +    {
+ +        return;
+ +    }
+ +
       dd_bcast(dd, sizeof(int), &dfhist->bEquil);
       dd_bcast(dd, sizeof(int), &dfhist->nlambda);
       dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
@@@ -1538,7 -1534,7 +1556,7 @@@
           dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
           dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
   
- -        for (i = 0; i < nlam; i++)
+ +        for (int i = 0; i < nlam; i++)
           {
               dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
               dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
@@@ -1552,7 -1548,7 +1570,7 @@@
   
   static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
                                   t_state *state, t_state *state_local,
- -                                rvec **f)
+ +                                PaddedRVecVector *f)
   {
       int  i, j, nh;
   
@@@ -1572,10 -1568,7 +1590,10 @@@
           copy_mat(state->boxv, state_local->boxv);
           copy_mat(state->svir_prev, state_local->svir_prev);
           copy_mat(state->fvir_prev, state_local->fvir_prev);
- -        copy_df_history(&state_local->dfhist, &state->dfhist);
+ +        if (state->dfhist != NULL)
+ +        {
+ +            copy_df_history(state_local->dfhist, state->dfhist);
+ +        }
           for (i = 0; i < state_local->ngtc; i++)
           {
               for (j = 0; j < nh; j++)
@@@ -1594,7 -1587,7 +1612,7 @@@
               }
           }
       }
- -    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
+ +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
       dd_bcast(dd, sizeof(int), &state_local->fep_state);
       dd_bcast(dd, sizeof(real), &state_local->veta);
       dd_bcast(dd, sizeof(real), &state_local->vol0);
@@@ -1603,17 -1596,19 +1621,17 @@@
       dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
       dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
       dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
- -    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
- -    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
- -    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
- -    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
- -    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
+ +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
+ +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
+ +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
+ +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
+ +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
   
       /* communicate df_history -- required for restarting from checkpoint */
- -    dd_distribute_dfhist(dd, &state_local->dfhist);
+ +    dd_distribute_dfhist(dd, state_local->dfhist);
+ +
+ +    dd_resize_state(state_local, f, dd->nat_home);
   
- -    if (dd->nat_home > state_local->nalloc)
- -    {
- -        dd_realloc_state(state_local, f, dd->nat_home);
- -    }
       for (i = 0; i < estNR; i++)
       {
           if (EST_DISTR(i) && (state_local->flags & (1<<i)))
@@@ -1621,15 -1616,15 +1639,15 @@@
               switch (i)
               {
                   case estX:
- -                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
+ +                    dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
                       break;
                   case estV:
- -                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
+ +                    dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
                       break;
                   case est_SDX_NOTSUPPORTED:
                       break;
                   case estCGP:
- -                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
+ +                    dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
                       break;
                   case estDISRE_INITF:
                   case estDISRE_RM3TAV:
@@@ -1754,7 -1749,7 +1772,7 @@@ void write_dd_pdb(const char *fn, gmx_i
       char          fname[STRLEN], buf[22];
       FILE         *out;
       int           i, ii, resnr, c;
- -    char         *atomname, *resname;
+ +    const char   *atomname, *resname;
       real          b;
       gmx_domdec_t *dd;
   
@@@ -1770,11 -1765,10 +1788,11 @@@
   
       fprintf(out, "TITLE     %s\n", title);
       gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+ +    int molb = 0;
       for (i = 0; i < natoms; i++)
       {
           ii = dd->gatindex[i];
- -        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
+ +        mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
           if (i < dd->comm->nat[ddnatZONE])
           {
               c = 0;
@@@ -2157,26 -2151,25 +2175,26 @@@ static void set_zones_ncg_home(gmx_domd
   }
   
   static void rebuild_cgindex(gmx_domdec_t *dd,
- -                            const int *gcgs_index, t_state *state)
+ +                            const int *gcgs_index, const t_state *state)
   {
- -    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
+ +    int * gmx_restrict dd_cg_gl = dd->index_gl;
+ +    int * gmx_restrict cgindex  = dd->cgindex;
+ +    int                nat      = 0;
   
- -    ind        = state->cg_gl;
- -    dd_cg_gl   = dd->index_gl;
- -    cgindex    = dd->cgindex;
- -    nat        = 0;
+ +    /* Copy back the global charge group indices from state
+ +     * and rebuild the local charge group to atom index.
+ +     */
       cgindex[0] = nat;
- -    for (i = 0; i < state->ncg_gl; i++)
+ +    for (unsigned int i = 0; i < state->cg_gl.size(); i++)
       {
           cgindex[i]  = nat;
- -        cg_gl       = ind[i];
+ +        int cg_gl   = state->cg_gl[i];
           dd_cg_gl[i] = cg_gl;
           nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
       }
- -    cgindex[i] = nat;
+ +    cgindex[state->cg_gl.size()] = nat;
   
- -    dd->ncg_home = state->ncg_gl;
+ +    dd->ncg_home = state->cg_gl.size();
       dd->nat_home = nat;
   
       set_zones_ncg_home(dd);
@@@ -2523,7 -2516,7 +2541,7 @@@ static gmx_bool check_grid_jump(gmx_int
                   /* This error should never be triggered under normal
                    * circumstances, but you never know ...
                    */
-                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
+                 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
                             gmx_step_str(step, buf),
                             dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
               }
@@@ -3012,7 -3005,7 +3030,7 @@@ static void dd_cell_sizes_dlb_root_enfo
       if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
       {
           char buf[22];
-         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
+         gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                     gmx_step_str(step, buf),
                     dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                     ncd, comm->cellsize_min[dim]);
@@@ -3557,7 -3550,7 +3575,7 @@@ static void comm_dd_ns_cell_sizes(gmx_d
               comm->cellsize_min[dim])
           {
               char buf[22];
-             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
+             gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                         gmx_step_str(step, buf), dim2char(dim),
                         comm->cell_x1[dim] - comm->cell_x0[dim],
                         ddbox->skew_fac[dim],
@@@ -4239,7 -4232,7 +4257,7 @@@ static void calc_cg_move(FILE *fplog, g
                       if (pos_d >= limit1[d])
                       {
                           cg_move_error(fplog, dd, step, cg, d, 1,
- -                                      cg_cm != state->x, limitd[d],
+ +                                      cg_cm != as_rvec_array(state->x.data()), limitd[d],
                                         cg_cm[cg], cm_new, pos_d);
                       }
                       dev[d] = 1;
@@@ -4266,7 -4259,7 +4284,7 @@@
                       if (pos_d < limit0[d])
                       {
                           cg_move_error(fplog, dd, step, cg, d, -1,
- -                                      cg_cm != state->x, limitd[d],
+ +                                      cg_cm != as_rvec_array(state->x.data()), limitd[d],
                                         cg_cm[cg], cm_new, pos_d);
                       }
                       dev[d] = -1;
@@@ -4350,7 -4343,7 +4368,7 @@@
   
   static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
                                  gmx_domdec_t *dd, ivec tric_dir,
- -                               t_state *state, rvec **f,
+ +                               t_state *state, PaddedRVecVector *f,
                                  t_forcerec *fr,
                                  gmx_bool bCompact,
                                  t_nrnb *nrnb,
@@@ -4479,7 -4472,7 +4497,7 @@@
                            cgindex,
                            ( thread   *dd->ncg_home)/nthread,
                            ((thread+1)*dd->ncg_home)/nthread,
- -                         fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
+ +                         fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
                            move);
           }
           GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
@@@ -4558,7 -4551,7 +4576,7 @@@
                */
               home_pos_cg =
                   compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
- -                                        nvec, state->x, comm, FALSE);
+ +                                        nvec, as_rvec_array(state->x.data()), comm, FALSE);
               if (bCompact)
               {
                   home_pos_cg -= *ncg_moved;
@@@ -4572,19 -4565,16 +4590,19 @@@
       vec         = 0;
       home_pos_at =
           compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
- -                                nvec, vec++, state->x, comm, bCompact);
+ +                                nvec, vec++, as_rvec_array(state->x.data()),
+ +                                comm, bCompact);
       if (bV)
       {
           compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
- -                                nvec, vec++, state->v, comm, bCompact);
+ +                                nvec, vec++, as_rvec_array(state->v.data()),
+ +                                comm, bCompact);
       }
       if (bCGP)
       {
           compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
- -                                nvec, vec++, state->cg_p, comm, bCompact);
+ +                                nvec, vec++, as_rvec_array(state->cg_p.data()),
+ +                                comm, bCompact);
       }
   
       if (bCompact)
@@@ -4660,13 -4650,6 +4678,13 @@@
               nvr      += i;
           }
   
+ +        dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
+ +        if (fr->cutoff_scheme == ecutsGROUP)
+ +        {
+ +            /* Here we resize to more than necessary and shrink later */
+ +            dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
+ +        }
+ +
           /* Process the received charge groups */
           buf_pos = 0;
           for (cg = 0; cg < ncg_recv; cg++)
@@@ -4779,6 -4762,7 +4797,6 @@@
                   dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
                   dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
                   /* Copy the state from the buffer */
- -                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
                   if (fr->cutoff_scheme == ecutsGROUP)
                   {
                       cg_cm = fr->cg_cm;
@@@ -4794,6 -4778,10 +4812,6 @@@
                       comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
                   }
   
- -                if (home_pos_at+nrcg > state->nalloc)
- -                {
- -                    dd_realloc_state(state, f, home_pos_at+nrcg);
- -                }
                   for (i = 0; i < nrcg; i++)
                   {
                       copy_rvec(comm->vbuf.v[buf_pos++],
@@@ -4862,12 -4850,6 +4880,12 @@@
       dd->ncg_home = home_pos_cg;
       dd->nat_home = home_pos_at;
   
+ +    if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
+ +    {
+ +        /* We overallocated before, we need to set the right size here */
+ +        dd_resize_state(state, f, dd->nat_home);
+ +    }
+ +
       if (debug)
       {
           fprintf(debug,
@@@ -6198,7 -6180,7 +6216,7 @@@ static int check_dlb_support(FILE *fplo
       {
           case 'a': dlbState = edlbsOffCanTurnOn; break;
           case 'n': dlbState = edlbsOffForever;   break;
-         case 'y': dlbState = edlbsOn;           break;
+         case 'y': dlbState = edlbsOnForever;    break;
           default: gmx_incons("Unknown dlb_opt");
       }
   
@@@ -6209,7 -6191,7 +6227,7 @@@
   
       if (!EI_DYNAMICS(ir->eI))
       {
-         if (dlbState == edlbsOn)
+         if (dlbState == edlbsOnForever)
           {
               sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
               dd_warning(cr, fplog, buf);
@@@ -6231,10 -6213,11 +6249,11 @@@
               case edlbsOffForever:
                   break;
               case edlbsOffCanTurnOn:
+             case edlbsOnCanTurnOff:
                   dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
                   dlbState = edlbsOffForever;
                   break;
-             case edlbsOn:
+             case edlbsOnForever:
                   dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                   break;
               default:
@@@ -6347,8 -6330,12 +6366,12 @@@ static void set_dd_limits_and_grid(FIL
       /* Initialize to GPU share count to 0, might change later */
       comm->nrank_gpu_shared = 0;
   
-     comm->dlbState                 = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
-     comm->bCheckWhetherToTurnDlbOn = TRUE;
+     comm->dlbState         = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
+     /* To consider turning DLB on after 2*nstlist steps we need to check
+      * at partitioning count 3. Thus we need to increase the first count by 2.
+      */
+     comm->ddPartioningCountFirstDlbOff += 2;
   
       if (fplog)
       {
@@@ -6724,16 -6711,10 +6747,10 @@@ static void turn_on_dlb(FILE *fplog, t_
       gmx_domdec_comm_t *comm;
       real               cellsize_min;
       int                d, nc, i;
-     char               buf[STRLEN];
   
       dd   = cr->dd;
       comm = dd->comm;
   
-     if (fplog)
-     {
-         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
-     }
- 
       cellsize_min = comm->cellsize_min[dd->dim[0]];
       for (d = 1; d < dd->ndim; d++)
       {
@@@ -6742,7 -6723,8 +6759,8 @@@
   
       if (cellsize_min < comm->cellsize_limit*1.05)
       {
-         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+         char buf[STRLEN];
+         sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
   
           /* Change DLB from "auto" to "no". */
           comm->dlbState = edlbsOffForever;
@@@ -6750,8 -6732,16 +6768,16 @@@
           return;
       }
   
-     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
-     comm->dlbState = edlbsOn;
+     char buf[STRLEN];
+     sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
+     dd_warning(cr, fplog, buf);
+     comm->dlbState = edlbsOnCanTurnOff;
+ 
+     /* Store the non-DLB performance, so we can check if DLB actually
+      * improves performance.
+      */
+     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
+     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
   
       set_dlb_limits(dd);
   
@@@ -6780,6 -6770,27 +6806,27 @@@
       }
   }
   
+ static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+ {
+     gmx_domdec_t *dd = cr->dd;
+ 
+     char          buf[STRLEN];
+     sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
+     dd_warning(cr, fplog, buf);
+     dd->comm->dlbState                     = edlbsOffCanTurnOn;
+     dd->comm->haveTurnedOffDlb             = true;
+     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+ }
+ 
+ static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+ {
+     GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
+     char buf[STRLEN];
+     sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
+     dd_warning(cr, fplog, buf);
+     cr->dd->comm->dlbState = edlbsOffForever;
+ }
+ 
   static char *init_bLocalCG(const gmx_mtop_t *mtop)
   {
       int   ncg, cg;
@@@ -7241,7 -7252,7 +7288,7 @@@ static gmx_bool test_dd_cutoff(t_commre
       dd = cr->dd;
   
       set_ddbox(dd, FALSE, cr, ir, state->box,
- -              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
+ +              TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
   
       LocallyLimited = 0;
   
@@@ -7344,6 -7355,14 +7391,14 @@@ static void dd_dlb_set_should_check_whe
       if (dd->comm->dlbState == edlbsOffCanTurnOn)
       {
           dd->comm->bCheckWhetherToTurnDlbOn = bValue;
+ 
+         if (bValue == TRUE)
+         {
+             /* Store the DD partitioning count, so we can ignore cycle counts
+              * over the next nstlist steps, which are often slower.
+              */
+             dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
+         }
       }
   }
   
@@@ -7352,13 -7371,20 +7407,20 @@@
    */
   static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
   {
-     const int nddp_chk_dlb = 100;
- 
       if (dd->comm->dlbState != edlbsOffCanTurnOn)
       {
           return FALSE;
       }
   
+     if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
+     {
+         /* We ignore the first nstlist steps at the start of the run
+          * or after PME load balancing or after turning DLB off, since
+          * these often have extra allocation or cache miss overhead.
+          */
+         return FALSE;
+     }
+ 
       /* We should check whether we should use DLB directly after
        * unlocking DLB. */
       if (dd->comm->bCheckWhetherToTurnDlbOn)
@@@ -7368,10 -7394,10 +7430,10 @@@
           dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
           return TRUE;
       }
-     /* We should also check whether we should use DLB every 100
+     /* We check whether we should use DLB every c_checkTurnDlbOnInterval
        * partitionings (we do not do this every partioning, so that we
        * avoid excessive communication). */
-     if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
+     if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
       {
           return TRUE;
       }
@@@ -7381,7 -7407,7 +7443,7 @@@
   
   gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
   {
-     return (dd->comm->dlbState == edlbsOn);
+     return dlbIsOn(dd->comm);
   }
   
   gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
@@@ -7905,8 -7931,7 +7967,8 @@@ get_zone_pulse_cgs(gmx_domdec_t *dd
   
   static void setup_dd_communication(gmx_domdec_t *dd,
                                      matrix box, gmx_ddbox_t *ddbox,
- -                                   t_forcerec *fr, t_state *state, rvec **f)
+ +                                   t_forcerec *fr,
+ +                                   t_state *state, PaddedRVecVector *f)
   {
       int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
       int                    nzone, nzone_send, zone, zonei, cg0, cg1;
@@@ -7953,7 -7978,7 +8015,7 @@@
               cg_cm = fr->cg_cm;
               break;
           case ecutsVERLET:
- -            cg_cm = state->x;
+ +            cg_cm = as_rvec_array(state->x.data());
               break;
           default:
               gmx_incons("unimplemented");
@@@ -8291,7 -8316,7 +8353,7 @@@
               }
               else
               {
- -                cg_cm = state->x;
+ +                cg_cm = as_rvec_array(state->x.data());
               }
               /* Communicate cg_cm */
               if (cd->bInPlace)
@@@ -8933,15 -8958,15 +8995,15 @@@ static void dd_sort_state(gmx_domdec_t 
               switch (i)
               {
                   case estX:
- -                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
+ +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
                       break;
                   case estV:
- -                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
+ +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
                       break;
                   case est_SDX_NOTSUPPORTED:
                       break;
                   case estCGP:
- -                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
+ +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
                       break;
                   case estLD_RNG:
                   case estLD_RNGI:
@@@ -9115,7 -9140,7 +9177,7 @@@ void dd_partition_system(FIL
                            const gmx_mtop_t    *top_global,
                            const t_inputrec    *ir,
                            t_state             *state_local,
- -                         rvec               **f,
+ +                         PaddedRVecVector    *f,
                            t_mdatoms           *mdatoms,
                            gmx_localtop_t      *top_local,
                            t_forcerec          *fr,
@@@ -9132,7 -9157,7 +9194,7 @@@
       gmx_int64_t        step_pcoupl;
       rvec               cell_ns_x0, cell_ns_x1;
       int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
-     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
+     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
       gmx_bool           bRedist, bSortCG, bResortAll;
       ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
       real               grid_density;
@@@ -9223,40 -9248,109 +9285,109 @@@
               }
               comm->n_load_collect++;
   
-             if (bCheckWhetherToTurnDlbOn)
+             if (dlbIsOn(comm))
               {
+                 if (DDMASTER(dd))
+                 {
+                     /* Add the measured cycles to the running average */
+                     const float averageFactor        = 0.1f;
+                     comm->cyclesPerStepDlbExpAverage =
+                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
+                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
+                 }
+                 if (comm->dlbState == edlbsOnCanTurnOff &&
+                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
+                 {
+                     gmx_bool turnOffDlb;
+                     if (DDMASTER(dd))
+                     {
+                         /* If the running averaged cycles with DLB are more
+                          * than before we turned on DLB, turn off DLB.
+                          * We will again run and check the cycles without DLB
+                          * and we can then decide if to turn off DLB forever.
+                          */
+                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
+                                       comm->cyclesPerStepBeforeDLB);
+                     }
+                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
+                     if (turnOffDlb)
+                     {
+                         /* To turn off DLB, we need to redistribute the atoms */
+                         dd_collect_state(dd, state_local, state_global);
+                         bMasterState = TRUE;
+                         turn_off_dlb(fplog, cr, step);
+                     }
+                 }
+             }
+             else if (bCheckWhetherToTurnDlbOn)
+             {
+                 gmx_bool turnOffDlbForever = FALSE;
+                 gmx_bool turnOnDlb         = FALSE;
+ 
                   /* Since the timings are node dependent, the master decides */
                   if (DDMASTER(dd))
                   {
-                     /* Here we check if the max PME rank load is more than 0.98
-                      * the max PP force load. If so, PP DLB will not help,
-                      * since we are (almost) limited by PME. Furthermore,
-                      * DLB will cause a significant extra x/f redistribution
-                      * cost on the PME ranks, which will then surely result
-                      * in lower total performance.
-                      * This check might be fragile, since one measurement
-                      * below 0.98 (although only done once every 100 DD part.)
-                      * could turn on DLB for the rest of the run.
+                     /* If we recently turned off DLB, we want to check if
+                      * performance is better without DLB. We want to do this
+                      * ASAP to minimize the chance that external factors
+                      * slowed down the DLB step are gone here and we
+                      * incorrectly conclude that DLB was causing the slowdown.
+                      * So we measure one nstlist block, no running average.
                        */
-                     if (cr->npmenodes > 0 &&
-                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                     if (comm->haveTurnedOffDlb &&
+                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
+                         comm->cyclesPerStepDlbExpAverage)
                       {
-                         bTurnOnDLB = FALSE;
+                         /* After turning off DLB we ran nstlist steps in fewer
+                          * cycles than with DLB. This likely means that DLB
+                          * in not benefical, but this could be due to a one
+                          * time unlucky fluctuation, so we require two such
+                          * observations in close succession to turn off DLB
+                          * forever.
+                          */
+                         if (comm->dlbSlowerPartitioningCount > 0 &&
+                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
+                         {
+                             turnOffDlbForever = TRUE;
+                         }
+                         comm->haveTurnedOffDlb           = false;
+                         /* Register when we last measured DLB slowdown */
+                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
                       }
                       else
                       {
-                         bTurnOnDLB =
-                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
-                     }
-                     if (debug)
-                     {
-                         fprintf(debug, "step %s, imb loss %f\n",
-                                 gmx_step_str(step, sbuf),
-                                 dd_force_imb_perf_loss(dd));
+                         /* Here we check if the max PME rank load is more than 0.98
+                          * the max PP force load. If so, PP DLB will not help,
+                          * since we are (almost) limited by PME. Furthermore,
+                          * DLB will cause a significant extra x/f redistribution
+                          * cost on the PME ranks, which will then surely result
+                          * in lower total performance.
+                          */
+                         if (cr->npmenodes > 0 &&
+                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                         {
+                             turnOnDlb = FALSE;
+                         }
+                         else
+                         {
+                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                         }
                       }
                   }
-                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
-                 if (bTurnOnDLB)
+                 struct
+                 {
+                     gmx_bool turnOffDlbForever;
+                     gmx_bool turnOnDlb;
+                 }
+                 bools {
+                     turnOffDlbForever, turnOnDlb
+                 };
+                 dd_bcast(dd, sizeof(bools), &bools);
+                 if (bools.turnOffDlbForever)
+                 {
+                     turn_off_dlb_forever(fplog, cr, step);
+                 }
+                 else if (bools.turnOnDlb)
                   {
                       turn_on_dlb(fplog, cr, step);
                       bDoDLB = TRUE;
@@@ -9276,10 -9370,10 +9407,10 @@@
           ncgindex_set = 0;
   
           set_ddbox(dd, bMasterState, cr, ir, state_global->box,
- -                  TRUE, cgs_gl, state_global->x, &ddbox);
+ +                  TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
   
           get_cg_distribution(fplog, dd, cgs_gl,
- -                            state_global->box, &ddbox, state_global->x);
+ +                            state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
   
           dd_distribute_state(dd, cgs_gl,
                               state_global, state_local, f);
@@@ -9292,7 -9386,7 +9423,7 @@@
           if (fr->cutoff_scheme == ecutsGROUP)
           {
               calc_cgcm(fplog, 0, dd->ncg_home,
- -                      &top_local->cgs, state_local->x, fr->cg_cm);
+ +                      &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
           }
   
           inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
@@@ -9323,7 -9417,7 +9454,7 @@@
           {
               /* Redetermine the cg COMs */
               calc_cgcm(fplog, 0, dd->ncg_home,
- -                      &top_local->cgs, state_local->x, fr->cg_cm);
+ +                      &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
           }
   
           inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
@@@ -9331,7 -9425,7 +9462,7 @@@
           dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
   
           set_ddbox(dd, bMasterState, cr, ir, state_local->box,
- -                  TRUE, &top_local->cgs, state_local->x, &ddbox);
+ +                  TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
   
           bRedist = dlbIsOn(comm);
       }
@@@ -9350,7 -9444,7 +9481,7 @@@
               copy_rvec(comm->box_size, ddbox.box_size);
           }
           set_ddbox(dd, bMasterState, cr, ir, state_local->box,
- -                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
+ +                  bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
   
           bBoxChanged = TRUE;
           bRedist     = TRUE;
@@@ -9439,7 -9533,7 +9570,7 @@@
                                     0, dd->ncg_home,
                                     comm->zones.dens_zone0,
                                     fr->cginfo,
- -                                  state_local->x,
+ +                                  as_rvec_array(state_local->x.data()),
                                     ncg_moved, bRedist ? comm->moved : NULL,
                                     fr->nbv->grp[eintLocal].kernel_type,
                                     fr->nbv->grp[eintLocal].nbat);
@@@ -9475,10 -9569,6 +9606,10 @@@
           }
           dd_sort_state(dd, fr->cg_cm, fr, state_local,
                         bResortAll ? -1 : ncg_home_old);
+ +
+ +        /* After sorting and compacting we set the correct size */
+ +        dd_resize_state(state_local, f, dd->nat_home);
+ +
           /* Rebuild all the indices */
           ga2la_clear(dd->ga2la);
           ncgindex_set = 0;
@@@ -9507,7 -9597,7 +9638,7 @@@
   
       /*
          write_dd_pdb("dd_home",step,"dump",top_global,cr,
- -                 -1,state_local->x,state_local->box);
+ +                 -1,as_rvec_array(state_local->x.data()),state_local->box);
        */
   
       wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
@@@ -9520,7 -9610,7 +9651,7 @@@
       dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                         comm->cellsize_min, np,
                         fr,
- -                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
+ +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
                         vsite, top_global, top_local);
   
       wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
@@@ -9562,8 -9652,10 +9693,8 @@@
        * or constraint communication.
        */
       state_local->natoms = comm->nat[ddnatNR-1];
- -    if (state_local->natoms > state_local->nalloc)
- -    {
- -        dd_realloc_state(state_local, f, state_local->natoms);
- -    }
+ +
+ +    dd_resize_state(state_local, f, state_local->natoms);
   
       if (fr->bF_NoVirSum)
       {
@@@ -9597,15 -9689,31 +9728,15 @@@
       forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                           dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
   
- -    /* We make the all mdatoms up to nat_tot_con.
- -     * We could save some work by only setting invmass
- -     * between nat_tot and nat_tot_con.
- -     */
- -    /* This call also sets the new number of home particles to dd->nat_home */
- -    atoms2md(top_global, ir,
- -             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
- -
- -    /* Now we have the charges we can sort the FE interactions */
- -    dd_sort_local_top(dd, mdatoms, top_local);
- -
- -    if (vsite != NULL)
- -    {
- -        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
- -        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
- -                                  mdatoms, FALSE, vsite);
- -    }
+ +    /* Update atom data for mdatoms and several algorithms */
+ +    mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
+ +                              NULL, mdatoms, vsite, NULL);
   
       if (ir->implicit_solvent)
       {
           make_local_gb(cr, fr->born, ir->gb_algorithm);
       }
   
- -    setup_bonded_threading(fr, &top_local->idef);
- -
       if (!(cr->duty & DUTY_PME))
       {
           /* Send the charges and/or c6/sigmas to our PME only node */
@@@ -9653,15 -9761,15 +9784,15 @@@
        * the last vsite construction, we need to communicate the constructing
        * atom coordinates again (for spreading the forces this MD step).
        */
- -    dd_move_x_vsites(dd, state_local->box, state_local->x);
+ +    dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
   
       wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
   
       if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
       {
- -        dd_move_x(dd, state_local->box, state_local->x);
+ +        dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
           write_dd_pdb("dd_dump", step, "dump", top_global, cr,
- -                     -1, state_local->x, state_local->box);
+ +                     -1, as_rvec_array(state_local->x.data()), state_local->box);
       }
   
       /* Store the partitioning step */
diff --combined src/gromacs/gmxpreprocess/readir.cpp

index 9e4fe92d633775db8378609680ac6779388d263d,054c0f648632621e8ff51bfbc2f1ffd3de6266d5..aac04e776aa14e5bef314811b63c446879462b32
--- 1/src/gromacs/gmxpreprocess/readir.cpp
--- 2/src/gromacs/gmxpreprocess/readir.cpp
+++ b/src/gromacs/gmxpreprocess/readir.cpp
@@@ -58,8 -58,6 +58,8 @@@
   #include "gromacs/mdtypes/inputrec.h"
   #include "gromacs/mdtypes/md_enums.h"
   #include "gromacs/mdtypes/pull-params.h"
+ +#include "gromacs/options/options.h"
+ +#include "gromacs/options/treesupport.h"
   #include "gromacs/pbcutil/pbc.h"
   #include "gromacs/topology/block.h"
   #include "gromacs/topology/ifunc.h"
@@@ -68,15 -66,8 +68,15 @@@
   #include "gromacs/topology/symtab.h"
   #include "gromacs/topology/topology.h"
   #include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/exceptions.h"
   #include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/gmxassert.h"
+ +#include "gromacs/utility/ikeyvaluetreeerror.h"
+ +#include "gromacs/utility/keyvaluetree.h"
+ +#include "gromacs/utility/keyvaluetreetransform.h"
   #include "gromacs/utility/smalloc.h"
+ +#include "gromacs/utility/stringcompare.h"
+ +#include "gromacs/utility/stringutil.h"
   
   #define MAXPTR 254
   #define NOGID  255
@@@ -105,6 -96,8 +105,6 @@@ typedef struct t_inputrec_string
       char   QMmethod[STRLEN], QMbasis[STRLEN], QMcharge[STRLEN], QMmult[STRLEN],
              bSH[STRLEN], CASorbitals[STRLEN], CASelectrons[STRLEN], SAon[STRLEN],
              SAoff[STRLEN], SAsteps[STRLEN], bTS[STRLEN], bOPT[STRLEN];
- -    char efield_x[STRLEN], efield_xt[STRLEN], efield_y[STRLEN],
- -         efield_yt[STRLEN], efield_z[STRLEN], efield_zt[STRLEN];
   
   } gmx_inputrec_strings;
   
@@@ -1766,50 -1759,6 +1766,50 @@@ static gmx_bool couple_lambda_has_vdw_o
               couple_lambda_value == ecouplamVDWQ);
   }
   
+ +namespace
+ +{
+ +
+ +class MdpErrorHandler : public gmx::IKeyValueTreeErrorHandler
+ +{
+ +    public:
+ +        explicit MdpErrorHandler(warninp_t wi)
+ +            : wi_(wi), mapping_(nullptr)
+ +        {
+ +        }
+ +
+ +        void setBackMapping(const gmx::IKeyValueTreeBackMapping &mapping)
+ +        {
+ +            mapping_ = &mapping;
+ +        }
+ +
+ +        virtual bool onError(gmx::UserInputError *ex, const gmx::KeyValueTreePath &context)
+ +        {
+ +            ex->prependContext(gmx::formatString("Error in mdp option \"%s\":",
+ +                                                 getOptionName(context).c_str()));
+ +            std::string message = gmx::formatExceptionMessageToString(*ex);
+ +            warning_error(wi_, message.c_str());
+ +            return true;
+ +        }
+ +
+ +    private:
+ +        std::string getOptionName(const gmx::KeyValueTreePath &context)
+ +        {
+ +            if (mapping_ != nullptr)
+ +            {
+ +                gmx::KeyValueTreePath path = mapping_->originalPath(context);
+ +                GMX_ASSERT(path.size() == 1, "Inconsistent mapping back to mdp options");
+ +                return path[0];
+ +            }
+ +            GMX_ASSERT(context.size() == 1, "Inconsistent context for mdp option parsing");
+ +            return context[0];
+ +        }
+ +
+ +        warninp_t                            wi_;
+ +        const gmx::IKeyValueTreeBackMapping *mapping_;
+ +};
+ +
+ +} // namespace
+ +
   void get_ir(const char *mdparin, const char *mdparout,
               t_inputrec *ir, t_gromppopts *opts,
               warninp_t wi)
@@@ -2238,26 -2187,18 +2238,26 @@@
       }
   
       /* Electric fields */
- -    CCTYPE("Electric fields");
- -    CTYPE ("Format is number of terms (int) and for all terms an amplitude (real)");
- -    CTYPE ("and a phase angle (real)");
- -    STYPE ("E-x",     is->efield_x,   NULL);
- -    CTYPE ("Time dependent (pulsed) electric field. Format is omega, time for pulse");
- -    CTYPE ("peak, and sigma (width) for pulse. Sigma = 0 removes pulse, leaving");
- -    CTYPE ("the field to be a cosine function.");
- -    STYPE ("E-xt",    is->efield_xt,  NULL);
- -    STYPE ("E-y",     is->efield_y,   NULL);
- -    STYPE ("E-yt",    is->efield_yt,  NULL);
- -    STYPE ("E-z",     is->efield_z,   NULL);
- -    STYPE ("E-zt",    is->efield_zt,  NULL);
+ +    {
+ +        gmx::KeyValueTreeObject      convertedValues = flatKeyValueTreeFromInpFile(ninp, inp);
+ +        gmx::KeyValueTreeTransformer transform;
+ +        transform.rules()->addRule()
+ +            .keyMatchType("/", gmx::StringCompareType::CaseAndDashInsensitive);
+ +        ir->efield->initMdpTransform(transform.rules());
+ +        for (const auto &path : transform.mappedPaths())
+ +        {
+ +            GMX_ASSERT(path.size() == 1, "Inconsistent mapping back to mdp options");
+ +            mark_einp_set(ninp, inp, path[0].c_str());
+ +        }
+ +        gmx::Options                 options;
+ +        ir->efield->initMdpOptions(&options);
+ +        MdpErrorHandler              errorHandler(wi);
+ +        auto                         result
+ +            = transform.transform(convertedValues, &errorHandler);
+ +        errorHandler.setBackMapping(result.backMapping());
+ +        gmx::assignOptionsFromKeyValueTree(&options, result.object(),
+ +                                           &errorHandler);
+ +    }
   
       /* Ion/water position swapping ("computational electrophysiology") */
       CCTYPE("Ion/water position swapping for computational electrophysiology setups");
@@@ -2807,6 -2748,7 +2807,6 @@@ static void calc_nrdf(gmx_mtop_t *mtop
       double                 *nrdf_tc, *nrdf_vcm, nrdf_uc, *nrdf_vcm_sub;
       ivec                   *dof_vcm;
       gmx_mtop_atomloop_all_t aloop;
- -    t_atom                 *atom;
       int                     mb, mol, ftype, as;
       gmx_molblock_t         *molb;
       gmx_moltype_t          *molt;
@@@ -2847,7 -2789,6 +2847,7 @@@
   
       snew(nrdf2, natoms);
       aloop = gmx_mtop_atomloop_all_init(mtop);
+ +    const t_atom *atom;
       while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
       {
           nrdf2[i] = 0;
@@@ -3082,6 -3023,54 +3082,6 @@@
       sfree(nrdf_vcm_sub);
   }
   
- -static void decode_cos(char *s, t_cosines *cosine)
- -{
- -    char              *t;
- -    char               format[STRLEN], f1[STRLEN];
- -    double             a, phi;
- -    int                i;
- -
- -    t = gmx_strdup(s);
- -    trim(t);
- -
- -    cosine->n   = 0;
- -    cosine->a   = NULL;
- -    cosine->phi = NULL;
- -    if (strlen(t))
- -    {
- -        if (sscanf(t, "%d", &(cosine->n)) != 1)
- -        {
- -            gmx_fatal(FARGS, "Cannot parse cosine multiplicity from string '%s'", t);
- -        }
- -        if (cosine->n <= 0)
- -        {
- -            cosine->n = 0;
- -        }
- -        else
- -        {
- -            snew(cosine->a, cosine->n);
- -            snew(cosine->phi, cosine->n);
- -
- -            sprintf(format, "%%*d");
- -            for (i = 0; (i < cosine->n); i++)
- -            {
- -                double  gmx_unused canary;
- -
- -                strcpy(f1, format);
- -                strcat(f1, "%lf%lf%lf");
- -                if (sscanf(t, f1, &a, &phi, &canary) != 2)
- -                {
- -                    gmx_fatal(FARGS, "Invalid input for electric field shift: '%s'", t);
- -                }
- -                cosine->a[i]   = a;
- -                cosine->phi[i] = phi;
- -                strcat(format, "%*lf%*lf");
- -            }
- -        }
- -    }
- -    sfree(t);
- -}
- -
   static gmx_bool do_egp_flag(t_inputrec *ir, gmx_groups_t *groups,
                               const char *option, const char *val, int flag)
   {
@@@ -3457,7 -3446,7 +3457,7 @@@ void do_index(const char* mdparin, cons
                   nSA_time = str_nelem(is->anneal_time, MAXPTR, ptr1);
                   if (nSA_time != k)
                   {
-                     gmx_fatal(FARGS, "Found %d annealing-time values, wanter %d\n", nSA_time, k);
+                     gmx_fatal(FARGS, "Found %d annealing-time values, wanted %d\n", nSA_time, k);
                   }
                   nSA_temp = str_nelem(is->anneal_temp, MAXPTR, ptr2);
                   if (nSA_temp != k)
@@@ -3800,6 -3789,13 +3800,6 @@@
           gmx_fatal(FARGS, "Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
       }
   
- -    decode_cos(is->efield_x, &(ir->ex[XX]));
- -    decode_cos(is->efield_xt, &(ir->et[XX]));
- -    decode_cos(is->efield_y, &(ir->ex[YY]));
- -    decode_cos(is->efield_yt, &(ir->et[YY]));
- -    decode_cos(is->efield_z, &(ir->ex[ZZ]));
- -    decode_cos(is->efield_zt, &(ir->et[ZZ]));
- -
       for (i = 0; (i < grps->nr); i++)
       {
           sfree(gnames[i]);
@@@ -4079,6 -4075,7 +4079,6 @@@ void triple_check(const char *mdparin, 
       rvec                      acc;
       gmx_mtop_atomloop_block_t aloopb;
       gmx_mtop_atomloop_all_t   aloop;
- -    t_atom                   *atom;
       ivec                      AbsRef;
       char                      warn_buf[STRLEN];
   
@@@ -4175,7 -4172,6 +4175,7 @@@
   
       bCharge = FALSE;
       aloopb  = gmx_mtop_atomloop_block_init(sys);
+ +    const t_atom *atom;
       while (gmx_mtop_atomloop_block_next(aloopb, &atom, &nmol))
       {
           if (atom->q != 0 || atom->qB != 0)
@@@ -4244,7 -4240,6 +4244,7 @@@
           clear_rvec(acc);
           snew(mgrp, sys->groups.grps[egcACC].nr);
           aloop = gmx_mtop_atomloop_all_init(sys);
+ +        const t_atom *atom;
           while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
           {
               mgrp[ggrpnr(&sys->groups, egcACC, i)] += atom->m;
diff --combined src/gromacs/mdlib/constr.cpp

index 715d8b4e1a5c55a35b1aee6e39f95b5831fa6149,8b9e70309bf2d098b28a440f54957923dd3654a7..630e19024807a3486ca7b839d16c73614b05e69f
--- 1/src/gromacs/mdlib/constr.cpp
--- 2/src/gromacs/mdlib/constr.cpp
+++ b/src/gromacs/mdlib/constr.cpp
@@@ -64,7 -64,6 +64,7 @@@
   #include "gromacs/pulling/pull.h"
   #include "gromacs/topology/block.h"
   #include "gromacs/topology/invblock.h"
+ +#include "gromacs/topology/mtop_lookup.h"
   #include "gromacs/topology/mtop_util.h"
   #include "gromacs/utility/exceptions.h"
   #include "gromacs/utility/fatalerror.h"
@@@ -184,7 -183,7 +184,7 @@@ static void write_constr_pdb(const cha
       FILE         *out;
       int           dd_ac0 = 0, dd_ac1 = 0, i, ii, resnr;
       gmx_domdec_t *dd;
- -    char         *anm, *resnm;
+ +    const char   *anm, *resnm;
   
       dd = NULL;
       if (DOMAINDECOMP(cr))
@@@ -208,7 -207,6 +208,7 @@@
   
       fprintf(out, "TITLE     %s\n", title);
       gmx_write_pdb_box(out, -1, box);
+ +    int molb = 0;
       for (i = start; i < start+homenr; i++)
       {
           if (dd != NULL)
@@@ -223,7 -221,7 +223,7 @@@
           {
               ii = i;
           }
- -        gmx_mtop_atominfo_global(mtop, ii, &anm, &resnr, &resnm);
+ +        mtopGetAtomAndResidueName(mtop, ii, &molb, &anm, &resnr, &resnm, nullptr);
           gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, anm, ' ', resnm, ' ', resnr, ' ',
                                    10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, 0.0, "");
       }
@@@ -391,7 -389,7 +391,7 @@@ gmx_bool constrain(FILE *fplog, gmx_boo
                                 invdt, v, vir != NULL, vir_r_m_dr,
                                 econq, nrnb,
                                 constr->maxwarn, &constr->warncount_lincs);
-         if (!bOK && constr->maxwarn >= 0)
+         if (!bOK && constr->maxwarn < INT_MAX)
           {
               if (fplog != NULL)
               {
@@@ -412,7 -410,7 +412,7 @@@
                                 idef, ir, x, xprime, nrnb,
                                 constr->lagr, lambda, dvdlambda,
                                 invdt, v, vir != NULL, vir_r_m_dr,
-                               constr->maxwarn >= 0, econq);
+                               constr->maxwarn < INT_MAX, econq);
                   break;
               case (econqVeloc):
                   bOK = bshakef(fplog, constr->shaked,
@@@ -420,14 -418,14 +420,14 @@@
                                 idef, ir, x, min_proj, nrnb,
                                 constr->lagr, lambda, dvdlambda,
                                 invdt, NULL, vir != NULL, vir_r_m_dr,
-                               constr->maxwarn >= 0, econq);
+                               constr->maxwarn < INT_MAX, econq);
                   break;
               default:
                   gmx_fatal(FARGS, "Internal error, SHAKE called for constraining something else than coordinates");
                   break;
           }
   
-         if (!bOK && constr->maxwarn >= 0)
+         if (!bOK && constr->maxwarn < INT_MAX)
           {
               if (fplog != NULL)
               {
@@@ -1279,6 -1277,10 +1279,10 @@@ gmx_constr_t init_constraints(FILE *fpl
       {
           constr->maxwarn = 0;
           sscanf(env, "%8d", &constr->maxwarn);
+         if (constr->maxwarn < 0)
+         {
+             constr->maxwarn = INT_MAX;
+         }
           if (fplog)
           {
               fprintf(fplog,
@@@ -1292,19 -1294,15 +1296,15 @@@
                       constr->maxwarn);
           }
       }
-     if (constr->maxwarn < 0 && fplog)
-     {
-         fprintf(fplog, "maxwarn < 0, will not stop on constraint errors\n");
-     }
       constr->warncount_lincs  = 0;
       constr->warncount_settle = 0;
   
       /* Initialize the essential dynamics sampling.
        * Put the pointer to the ED struct in constr */
       constr->ed = ed;
- -    if (ed != NULL || state->edsamstate.nED > 0)
+ +    if (ed != NULL || state->edsamstate != NULL)
       {
- -        init_edsam(mtop, ir, cr, ed, state->x, state->box, &state->edsamstate);
+ +        init_edsam(mtop, ir, cr, ed, as_rvec_array(state->x.data()), state->box, state->edsamstate);
       }
   
       constr->warn_mtop = mtop;
diff --combined src/gromacs/mdlib/sim_util.cpp

index ab1b7ceac3e90c15803b305732c2f6c9ed1ffa59,5fd64cb318699b095a0d60214ea0a15198d71725..adb06226038f562813f0e6af89f44d97ae3df085
--- 1/src/gromacs/mdlib/sim_util.cpp
--- 2/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@@ -108,9 -108,6 +108,6 @@@
   
   #include "nbnxn_gpu.h"
   
- static const bool useCuda   = GMX_GPU == GMX_GPU_CUDA;
- static const bool useOpenCL = GMX_GPU == GMX_GPU_OPENCL;
- 
   void print_time(FILE                     *out,
                   gmx_walltime_accounting_t walltime_accounting,
                   gmx_int64_t               step,
@@@ -206,18 -203,89 +203,18 @@@ void print_start(FILE *fplog, t_commre
                           walltime_accounting_get_start_time_stamp(walltime_accounting));
   }
   
- -static void sum_forces(int start, int end, rvec f[], rvec flr[])
+ +static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
   {
- -    int i;
+ +    /* TODO: remove this - 1 when padding is properly implemented */
+ +    int         end  = forceToAdd->size() - 1;
+ +    const rvec *fAdd = as_rvec_array(forceToAdd->data());
   
- -    if (gmx_debug_at)
- -    {
- -        pr_rvecs(debug, 0, "fsr", f+start, end-start);
- -        pr_rvecs(debug, 0, "flr", flr+start, end-start);
- -    }
       // cppcheck-suppress unreadVariable
       int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
   #pragma omp parallel for num_threads(nt) schedule(static)
- -    for (i = start; i < end; i++)
+ +    for (int i = 0; i < end; i++)
       {
- -        rvec_inc(f[i], flr[i]);
- -    }
- -}
- -
- -/*
- - * calc_f_el calculates forces due to an electric field.
- - *
- - * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
- - *
- - * Et[] contains the parameters for the time dependent
- - * part of the field.
- - * Ex[] contains the parameters for
- - * the spatial dependent part of the field.
- - * The function should return the energy due to the electric field
- - * (if any) but for now returns 0.
- - *
- - * WARNING:
- - * There can be problems with the virial.
- - * Since the field is not self-consistent this is unavoidable.
- - * For neutral molecules the virial is correct within this approximation.
- - * For neutral systems with many charged molecules the error is small.
- - * But for systems with a net charge or a few charged molecules
- - * the error can be significant when the field is high.
- - * Solution: implement a self-consistent electric field into PME.
- - */
- -static void calc_f_el(FILE *fp, int  start, int homenr,
- -                      real charge[], rvec f[],
- -                      t_cosines Ex[], t_cosines Et[], double t)
- -{
- -    rvec Ext;
- -    real t0;
- -    int  i, m;
- -
- -    for (m = 0; (m < DIM); m++)
- -    {
- -        if (Et[m].n > 0)
- -        {
- -            if (Et[m].n == 3)
- -            {
- -                t0     = Et[m].a[1];
- -                Ext[m] = std::cos(Et[m].a[0]*(t-t0))*std::exp(-gmx::square(t-t0)/(2.0*gmx::square(Et[m].a[2])));
- -            }
- -            else
- -            {
- -                Ext[m] = std::cos(Et[m].a[0]*t);
- -            }
- -        }
- -        else
- -        {
- -            Ext[m] = 1.0;
- -        }
- -        if (Ex[m].n > 0)
- -        {
- -            /* Convert the field strength from V/nm to MD-units */
- -            Ext[m] *= Ex[m].a[0]*FIELDFAC;
- -            for (i = start; (i < start+homenr); i++)
- -            {
- -                f[i][m] += charge[i]*Ext[m];
- -            }
- -        }
- -        else
- -        {
- -            Ext[m] = 0;
- -        }
- -    }
- -    if (fp != NULL)
- -    {
- -        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
- -                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
+ +        rvec_inc(f[i], fAdd[i]);
       }
   }
   
@@@ -302,7 -370,7 +299,7 @@@ static void pme_receive_force_ener(t_co
       wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
       dvdl_q  = 0;
       dvdl_lj = 0;
- -    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
+ +    gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
                         fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                         &cycles_seppme);
       enerd->term[F_COUL_RECIP] += e_q;
@@@ -359,7 -427,7 +356,7 @@@ static void post_process_forces(t_commr
                * if the constructing atoms aren't local.
                */
               wallcycle_start(wcycle, ewcVSITESPREAD);
- -            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
+ +            spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), NULL,
                              (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                              nrnb,
                              &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
@@@ -368,8 -436,15 +365,8 @@@
           if (flags & GMX_FORCE_VIRIAL)
           {
               /* Now add the forces, this is local */
- -            if (fr->bDomDec)
- -            {
- -                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
- -            }
- -            else
- -            {
- -                sum_forces(0, mdatoms->homenr,
- -                           f, fr->f_novirsum);
- -            }
+ +            sum_forces(f, fr->f_novirsum);
+ +
               if (EEL_FULL(fr->eeltype))
               {
                   /* Add the mesh contribution to the virial */
@@@ -711,18 -786,19 +708,18 @@@ void do_force_cutsVERLET(FILE *fplog, t
                            gmx_localtop_t *top,
                            gmx_groups_t gmx_unused *groups,
                            matrix box, rvec x[], history_t *hist,
- -                         rvec f[],
+ +                         PaddedRVecVector *force,
                            tensor vir_force,
                            t_mdatoms *mdatoms,
                            gmx_enerdata_t *enerd, t_fcdata *fcd,
                            real *lambda, t_graph *graph,
                            t_forcerec *fr, interaction_const_t *ic,
                            gmx_vsite_t *vsite, rvec mu_tot,
- -                         double t, FILE *field, gmx_edsam_t ed,
+ +                         double t, gmx_edsam_t ed,
                            gmx_bool bBornRadii,
                            int flags)
   {
       int                 cg1, i, j;
- -    int                 start, homenr;
       double              mu[2*DIM];
       gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
       gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
@@@ -739,8 -815,8 +736,8 @@@
       cycles_wait_gpu = 0;
       nbv             = fr->nbv;
   
- -    start  = 0;
- -    homenr = mdatoms->homenr;
+ +    const int start  = 0;
+ +    const int homenr = mdatoms->homenr;
   
       clear_mat(vir_force);
   
@@@ -816,6 -892,8 +813,6 @@@
            * we do not need to worry about shifting.
            */
   
- -        int pme_flags = 0;
- -
           wallcycle_start(wcycle, ewcPP_PMESENDX);
   
           bBS = (inputrec->nwall == 2);
@@@ -825,10 -903,20 +822,10 @@@
               svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
           }
   
- -        if (EEL_PME(fr->eeltype))
- -        {
- -            pme_flags |= GMX_PME_DO_COULOMB;
- -        }
- -
- -        if (EVDW_PME(fr->vdwtype))
- -        {
- -            pme_flags |= GMX_PME_DO_LJ;
- -        }
- -
           gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
- -                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+ +                                 lambda[efptCOUL], lambda[efptVDW],
                                    (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
- -                                 pme_flags, step);
+ +                                 step);
   
           wallcycle_stop(wcycle, ewcPP_PMESENDX);
       }
@@@ -1083,9 -1171,6 +1080,9 @@@
           wallcycle_stop(wcycle, ewcROT);
       }
   
+ +    /* Temporary solution until all routines take PaddedRVecVector */
+ +    rvec *f = as_rvec_array(force->data());
+ +
       /* Start the force cycle counter.
        * This counter is stopped after do_force_lowlevel.
        * No parallel communication should occur while this counter is running,
@@@ -1100,7 -1185,7 +1097,7 @@@
           {
               if (flags & GMX_FORCE_VIRIAL)
               {
- -                fr->f_novirsum = fr->f_novirsum_alloc;
+ +                fr->f_novirsum = fr->forceBufferNoVirialSummation;
               }
               else
               {
@@@ -1108,7 -1193,7 +1105,7 @@@
                    * a separate array for forces that do not contribute
                    * to the pressure.
                    */
- -                fr->f_novirsum = f;
+ +                fr->f_novirsum = force;
               }
           }
   
@@@ -1116,9 -1201,14 +1113,9 @@@
           {
               if (flags & GMX_FORCE_VIRIAL)
               {
- -                if (fr->bDomDec)
- -                {
- -                    clear_rvecs_omp(fr->f_novirsum_n, fr->f_novirsum);
- -                }
- -                else
- -                {
- -                    clear_rvecs_omp(homenr, fr->f_novirsum+start);
- -                }
+ +                /* TODO: remove this - 1 when padding is properly implemented */
+ +                clear_rvecs_omp(fr->f_novirsum->size() - 1,
+ +                                as_rvec_array(fr->f_novirsum->data()));
               }
           }
           /* Clear the short- and long-range forces */
@@@ -1274,7 -1364,7 +1271,7 @@@
   
       if (bDoForces && DOMAINDECOMP(cr))
       {
-         if (bUseGPU && useCuda)
+         if (bUseGPU)
           {
               /* We are done with the CPU compute, but the GPU local non-bonded
                * kernel can still be running while we communicate the forces.
@@@ -1293,10 -1383,15 +1290,15 @@@
       if (bUseOrEmulGPU)
       {
           /* wait for local forces (or calculate in emulation mode) */
-         if (bUseGPU && useCuda)
+         if (bUseGPU)
           {
               float       cycles_tmp, cycles_wait_est;
-             const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+             /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+              * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
+              * but even with a step of 0.1 ms the difference is less than 1%
+              * of the step time.
+              */
+             const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
   
               wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
               nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
@@@ -1309,7 -1404,7 +1311,7 @@@
               {
                   cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
   
-                 if (cycles_tmp < cuda_api_overhead_margin)
+                 if (cycles_tmp < gpuWaitApiOverheadMargin)
                   {
                       /* We measured few cycles, it could be that the kernel
                        * and transfer finished earlier and there was no actual
@@@ -1332,19 -1427,7 +1334,7 @@@
                */
               cycles_force    += cycles_wait_est;
               cycles_wait_gpu += cycles_wait_est;
-         }
-         else if (bUseGPU && useOpenCL)
-         {
   
-             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-             nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
-                                    flags, eatLocal,
-                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                    fr->fshift);
-             cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-         }
-         if (bUseGPU)
-         {
               /* now clear the GPU outputs while we finish the step on the CPU */
               wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
               nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
@@@ -1381,10 -1464,12 +1371,10 @@@
   
       if (bDoForces)
       {
- -        if (inputrecElecField(inputrec))
+ +        /* Compute forces due to electric field */
+ +        if (fr->efield != nullptr)
           {
- -            /* Compute forces due to electric field */
- -            calc_f_el(MASTER(cr) ? field : NULL,
- -                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
- -                      inputrec->ex, inputrec->et, t);
+ +            fr->efield->calculateForces(cr, mdatoms, fr->f_novirsum, t);
           }
   
           /* If we have NoVirSum forces, but we do not calculate the virial,
@@@ -1460,24 -1545,25 +1450,24 @@@ void do_force_cutsGROUP(FILE *fplog, t_
                           gmx_localtop_t *top,
                           gmx_groups_t *groups,
                           matrix box, rvec x[], history_t *hist,
- -                        rvec f[],
+ +                        PaddedRVecVector *force,
                           tensor vir_force,
                           t_mdatoms *mdatoms,
                           gmx_enerdata_t *enerd, t_fcdata *fcd,
                           real *lambda, t_graph *graph,
                           t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
- -                        double t, FILE *field, gmx_edsam_t ed,
+ +                        double t, gmx_edsam_t ed,
                           gmx_bool bBornRadii,
                           int flags)
   {
       int        cg0, cg1, i, j;
- -    int        start, homenr;
       double     mu[2*DIM];
       gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
       gmx_bool   bDoForces;
       float      cycles_pme, cycles_force;
   
- -    start  = 0;
- -    homenr = mdatoms->homenr;
+ +    const int  start  = 0;
+ +    const int  homenr = mdatoms->homenr;
   
       clear_mat(vir_force);
   
@@@ -1562,6 -1648,8 +1552,6 @@@
            * we do not need to worry about shifting.
            */
   
- -        int pme_flags = 0;
- -
           wallcycle_start(wcycle, ewcPP_PMESENDX);
   
           bBS = (inputrec->nwall == 2);
@@@ -1571,10 -1659,20 +1561,10 @@@
               svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
           }
   
- -        if (EEL_PME(fr->eeltype))
- -        {
- -            pme_flags |= GMX_PME_DO_COULOMB;
- -        }
- -
- -        if (EVDW_PME(fr->vdwtype))
- -        {
- -            pme_flags |= GMX_PME_DO_LJ;
- -        }
- -
           gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
- -                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+ +                                 lambda[efptCOUL], lambda[efptVDW],
                                    (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
- -                                 pme_flags, step);
+ +                                 step);
   
           wallcycle_stop(wcycle, ewcPP_PMESENDX);
       }
@@@ -1662,9 -1760,6 +1652,9 @@@
           wallcycle_stop(wcycle, ewcROT);
       }
   
+ +    /* Temporary solution until all routines take PaddedRVecVector */
+ +    rvec *f = as_rvec_array(force->data());
+ +
       /* Start the force cycle counter.
        * This counter is stopped after do_force_lowlevel.
        * No parallel communication should occur while this counter is running,
@@@ -1680,10 -1775,15 +1670,10 @@@
           {
               if (flags & GMX_FORCE_VIRIAL)
               {
- -                fr->f_novirsum = fr->f_novirsum_alloc;
- -                if (fr->bDomDec)
- -                {
- -                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
- -                }
- -                else
- -                {
- -                    clear_rvecs(homenr, fr->f_novirsum+start);
- -                }
+ +                fr->f_novirsum = fr->forceBufferNoVirialSummation;
+ +                /* TODO: remove this - 1 when padding is properly implemented */
+ +                clear_rvecs(fr->f_novirsum->size() - 1,
+ +                            as_rvec_array(fr->f_novirsum->data()));
               }
               else
               {
@@@ -1691,7 -1791,7 +1681,7 @@@
                    * a separate array for forces that do not contribute
                    * to the pressure.
                    */
- -                fr->f_novirsum = f;
+ +                fr->f_novirsum = force;
               }
           }
   
@@@ -1739,10 -1839,12 +1729,10 @@@
   
       if (bDoForces)
       {
- -        if (inputrecElecField(inputrec))
+ +        /* Compute forces due to electric field */
+ +        if (fr->efield != nullptr)
           {
- -            /* Compute forces due to electric field */
- -            calc_f_el(MASTER(cr) ? field : NULL,
- -                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
- -                      inputrec->ex, inputrec->et, t);
+ +            fr->efield->calculateForces(cr, mdatoms, fr->f_novirsum, t);
           }
   
           /* Communicate the forces */
@@@ -1760,7 -1862,7 +1750,7 @@@
               if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
                   (flags & GMX_FORCE_VIRIAL))
               {
- -                dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ +                dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), NULL);
               }
               wallcycle_stop(wcycle, ewcMOVEF);
           }
@@@ -1835,15 -1937,15 +1825,15 @@@ void do_force(FILE *fplog, t_commrec *c
                 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                 gmx_localtop_t *top,
                 gmx_groups_t *groups,
- -              matrix box, rvec x[], history_t *hist,
- -              rvec f[],
+ +              matrix box, PaddedRVecVector *coordinates, history_t *hist,
+ +              PaddedRVecVector *force,
                 tensor vir_force,
                 t_mdatoms *mdatoms,
                 gmx_enerdata_t *enerd, t_fcdata *fcd,
- -              real *lambda, t_graph *graph,
+ +              std::vector<real> *lambda, t_graph *graph,
                 t_forcerec *fr,
                 gmx_vsite_t *vsite, rvec mu_tot,
- -              double t, FILE *field, gmx_edsam_t ed,
+ +              double t, gmx_edsam_t ed,
                 gmx_bool bBornRadii,
                 int flags)
   {
@@@ -1853,11 -1955,6 +1843,11 @@@
           flags &= ~GMX_FORCE_NONBONDED;
       }
   
+ +    GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1), "We might need 1 element extra for SIMD");
+ +    GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1), "We might need 1 element extra for SIMD");
+ +
+ +    rvec *x = as_rvec_array(coordinates->data());
+ +
       switch (inputrec->cutoff_scheme)
       {
           case ecutsVERLET:
@@@ -1866,13 -1963,13 +1856,13 @@@
                                   top,
                                   groups,
                                   box, x, hist,
- -                                f, vir_force,
+ +                                force, vir_force,
                                   mdatoms,
                                   enerd, fcd,
- -                                lambda, graph,
+ +                                lambda->data(), graph,
                                   fr, fr->ic,
                                   vsite, mu_tot,
- -                                t, field, ed,
+ +                                t, ed,
                                   bBornRadii,
                                   flags);
               break;
@@@ -1882,12 -1979,12 +1872,12 @@@
                                  top,
                                  groups,
                                  box, x, hist,
- -                               f, vir_force,
+ +                               force, vir_force,
                                  mdatoms,
                                  enerd, fcd,
- -                               lambda, graph,
+ +                               lambda->data(), graph,
                                  fr, vsite, mu_tot,
- -                               t, field, ed,
+ +                               t, ed,
                                  bBornRadii,
                                  flags);
               break;
@@@ -1934,7 -2031,7 +1924,7 @@@ void do_constrain_first(FILE *fplog, gm
       /* constrain the current position */
       constrain(NULL, TRUE, FALSE, constr, &(top->idef),
                 ir, cr, step, 0, 1.0, md,
- -              state->x, state->x, NULL,
+ +              as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), NULL,
                 fr->bMolPBC, state->box,
                 state->lambda[efptBONDED], &dvdl_dum,
                 NULL, NULL, nrnb, econqCoord);
@@@ -1944,7 -2041,7 +1934,7 @@@
           /* also may be useful if we need the ekin from the halfstep for velocity verlet */
           constrain(NULL, TRUE, FALSE, constr, &(top->idef),
                     ir, cr, step, 0, 1.0, md,
- -                  state->x, state->v, state->v,
+ +                  as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
                     fr->bMolPBC, state->box,
                     state->lambda[efptBONDED], &dvdl_dum,
                     NULL, NULL, nrnb, econqVeloc);
@@@ -1974,10 -2071,10 +1964,10 @@@
           dvdl_dum = 0;
           constrain(NULL, TRUE, FALSE, constr, &(top->idef),
                     ir, cr, step, -1, 1.0, md,
- -                  state->x, savex, NULL,
+ +                  as_rvec_array(state->x.data()), savex, NULL,
                     fr->bMolPBC, state->box,
                     state->lambda[efptBONDED], &dvdl_dum,
- -                  state->v, NULL, nrnb, econqCoord);
+ +                  as_rvec_array(state->v.data()), NULL, nrnb, econqCoord);
   
           for (i = start; i < end; i++)
           {
@@@ -2462,7 -2559,7 +2452,7 @@@ void put_atoms_in_box_omp(int ePBC, con
   }
   
   // TODO This can be cleaned up a lot, and move back to runner.cpp
- -void finish_run(FILE *fplog, t_commrec *cr,
+ +void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
                   t_inputrec *inputrec,
                   t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                   gmx_walltime_accounting_t walltime_accounting,
@@@ -2539,7 -2636,7 +2529,7 @@@
       {
           struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
   
- -        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
+ +        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                           elapsed_time_over_all_ranks,
                           wcycle, cycle_sum, gputimes);
   
@@@ -2565,51 -2662,60 +2555,51 @@@
       }
   }
   
- -extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+ +extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, std::vector<real> *lambda, double *lam0)
   {
       /* this function works, but could probably use a logic rewrite to keep all the different
          types of efep straight. */
   
- -    int       i;
+ +    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ +    {
+ +        return;
+ +    }
+ +
       t_lambda *fep = ir->fepvals;
+ +    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
+ +                                            if checkpoint is set -- a kludge is in for now
+ +                                            to prevent this.*/
   
- -    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ +    lambda->resize(efptNR);
+ +
+ +    for (int i = 0; i < efptNR; i++)
       {
- -        for (i = 0; i < efptNR; i++)
+ +        /* overwrite lambda state with init_lambda for now for backwards compatibility */
+ +        if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
           {
- -            lambda[i] = 0.0;
+ +            (*lambda)[i] = fep->init_lambda;
               if (lam0)
               {
- -                lam0[i] = 0.0;
+ +                lam0[i] = (*lambda)[i];
               }
           }
- -        return;
- -    }
- -    else
- -    {
- -        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
- -                                             if checkpoint is set -- a kludge is in for now
- -                                             to prevent this.*/
- -        for (i = 0; i < efptNR; i++)
+ +        else
           {
- -            /* overwrite lambda state with init_lambda for now for backwards compatibility */
- -            if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
- -            {
- -                lambda[i] = fep->init_lambda;
- -                if (lam0)
- -                {
- -                    lam0[i] = lambda[i];
- -                }
- -            }
- -            else
+ +            (*lambda)[i] = fep->all_lambda[i][*fep_state];
+ +            if (lam0)
               {
- -                lambda[i] = fep->all_lambda[i][*fep_state];
- -                if (lam0)
- -                {
- -                    lam0[i] = lambda[i];
- -                }
+ +                lam0[i] = (*lambda)[i];
               }
           }
- -        if (ir->bSimTemp)
+ +    }
+ +    if (ir->bSimTemp)
+ +    {
+ +        /* need to rescale control temperatures to match current state */
+ +        for (int i = 0; i < ir->opts.ngtc; i++)
           {
- -            /* need to rescale control temperatures to match current state */
- -            for (i = 0; i < ir->opts.ngtc; i++)
+ +            if (ir->opts.ref_t[i] > 0)
               {
- -                if (ir->opts.ref_t[i] > 0)
- -                {
- -                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
- -                }
+ +                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
               }
           }
       }
@@@ -2618,9 -2724,9 +2608,9 @@@
       if (fplog != NULL)
       {
           fprintf(fplog, "Initial vector of lambda components:[ ");
- -        for (i = 0; i < efptNR; i++)
+ +        for (int i = 0; i < efptNR; i++)
           {
- -            fprintf(fplog, "%10.4f ", lambda[i]);
+ +            fprintf(fplog, "%10.4f ", (*lambda)[i]);
           }
           fprintf(fplog, "]\n");
       }
@@@ -2631,7 -2737,7 +2621,7 @@@
   void init_md(FILE *fplog,
                t_commrec *cr, t_inputrec *ir, const gmx_output_env_t *oenv,
                double *t, double *t0,
- -             real *lambda, int *fep_state, double *lam0,
+ +             std::vector<real> *lambda, int *fep_state, double *lam0,
                t_nrnb *nrnb, gmx_mtop_t *mtop,
                gmx_update_t **upd,
                int nfile, const t_filenm fnm[],
@@@ -2688,6 -2794,10 +2678,6 @@@
               please_cite(fplog, "Goga2012");
           }
       }
- -    if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
- -    {
- -        please_cite(fplog, "Caleman2008a");
- -    }
       init_nrnb(nrnb);
   
       if (nfile != -1)
author	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 31 Oct 2016 11:39:40 +0000 (12:39 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 31 Oct 2016 13:38:43 +0000 (14:38 +0100)
		1	2
cmake/gmxVersionInfo.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
docs/install-guide/index.rst	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/domdec/domdec.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxpreprocess/readir.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/constr.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/sim_util.cpp	patch \|	diff1 \|	diff2 \|	blob \| history