From: Mark Abraham Date: Wed, 1 Oct 2014 20:02:02 +0000 (+0200) Subject: Merge release-5-0 into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=19d3c2e5d0c401eb59010960d11a18b6ba2c54c6;hp=fe90f1c1c71a3a43a27ec9ba76e772ae54786c7f;p=alexxy%2Fgromacs.git Merge release-5-0 into master Conflicts: CMakeLists.txt Version numbers not bumped; fixed to use the right name for RelWithDebInfo. cmake/gmxCFlags.cmake Fixed to use the right name for RelWithDebInfo. src/gromacs/listed-forces/bonded.cpp New RB SIMD function in bonded.cpp had unused variables, now eliminated. src/gromacs/mdlib/domdec.cpp Bug fixes from release-5-0 incorporated. std::max now used in code newly arrived from release-5-0. md.cpp had no conflict, but fr->nbv->bUseGPU had to be replaced by use_GPU(fr->nbv) to work in master branch. Change-Id: I65326b691745111fbdaa9435be6c92fa1acf6e7d --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 70185949c4..7fbe947522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ if(CMAKE_CONFIGURATION_TYPES) "List of configuration types" FORCE) endif() -set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE) +set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) diff --git a/admin/uncrustify.cfg b/admin/uncrustify.cfg index d89351a7e6..f7308115b2 100644 --- a/admin/uncrustify.cfg +++ b/admin/uncrustify.cfg @@ -1575,3 +1575,12 @@ pp_define_at_level = false # false/true # all tokens are separated by any mix of ',' commas, '=' equal signs # and whitespace (space, tab) # + +# Teach uncrustify about the GROMACS attribute aliases that we use +# to hide compiler differences. This means that declarations like +# +# int i, j; +# int nthreads gmx_unused; +# +# do not align i with gmx_unused. +set ATTRIBUTE gmx_unused gmx_inline gmx_restrict diff --git a/cmake/gmxCFlags.cmake b/cmake/gmxCFlags.cmake index 011cada36e..3a0e3bbe15 100644 --- a/cmake/gmxCFlags.cmake +++ b/cmake/gmxCFlags.cmake @@ -64,7 +64,7 @@ function(gmx_set_cmake_compiler_flags) # be set up elsewhere and passed to this function, but it is # inconvenient in CMake to pass more than one list, and such a # list is only used here. - foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE) + foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE) set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}") endforeach() # Copy the flags that are only used by the real Release build @@ -85,9 +85,12 @@ function(gmx_set_cmake_compiler_flags) endif() # Append to the variables for the given build type for - # each language, in the parent scope. + # each language, in the parent scope. We add our new variables at the end, so + # compiler-specific choices are more likely to override default CMake choices. + # This is for instance useful for RelWithDebInfo builds, where we want to use the full + # set of our optimization flags detected in this file, rather than having -O2 override them. 
set(CMAKE_${language}_FLAGS${punctuation}${build_type} - "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}" + "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}" PARENT_SCOPE) endforeach() endforeach() diff --git a/cmake/gmxTestCompilerProblems.cmake b/cmake/gmxTestCompilerProblems.cmake index e534730013..a8cdb870df 100644 --- a/cmake/gmxTestCompilerProblems.cmake +++ b/cmake/gmxTestCompilerProblems.cmake @@ -80,8 +80,11 @@ macro(gmx_test_compiler_problems) message(WARNING "All tested PGI compiler versions (up to 12.9.0) generate binaries which produce incorrect results, or even fail to compile Gromacs. Highly recommended to use a different compiler. If you choose to use PGI, make sure to run the regressiontests.") endif() - if(CMAKE_COMPILER_IS_GNUCC AND WIN32 AND (GMX_SIMD STREQUAL "AVX_256" OR GMX_SIMD STREQUAL "AVX2_256")) - message(WARNING "GCC on Windows with AVX crashes. Choose SSE4_1 or a different compiler.") # GCC bug 49001. + if(CMAKE_COMPILER_IS_GNUCC AND + (CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9.0" OR CMAKE_SIZEOF_VOID_P EQUAL 8) + AND (WIN32 OR CYGWIN) + AND GMX_SIMD MATCHES "AVX" AND NOT GMX_SIMD STREQUAL AVX_128_FMA) + message(WARNING "GCC on Windows (GCC older than 4.9 or any version when compiling for 64bit) with AVX (other than AVX_128_FMA) crashes. Choose a different GMX_SIMD or a different compiler.") # GCC bug 49001, 54412. endif() if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND WIN32 AND NOT CYGWIN) diff --git a/src/gromacs/gmxana/gmx_tune_pme.c b/src/gromacs/gmxana/gmx_tune_pme.c index b9c2af3e71..c4a7e6e021 100644 --- a/src/gromacs/gmxana/gmx_tune_pme.c +++ b/src/gromacs/gmxana/gmx_tune_pme.c @@ -1308,9 +1308,12 @@ static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp, { /* To prevent confusion, do not again issue a gmx_fatal here since we already * get the error message from mdrun itself */ - sprintf(msg, "Cannot run the benchmark simulations! Please check the error message of\n" + sprintf(msg, + "Cannot run the first benchmark simulation! Please check the error message of\n" "mdrun for the source of the problem. Did you provide a command line\n" - "argument that neither g_tune_pme nor mdrun understands? Offending command:\n" + "argument that neither gmx tune_pme nor mdrun understands? If you're\n" + "sure your command line should work, you can bypass this check with \n" + "gmx tune_pme -nocheck. The failing command was:\n" "\n%s\n\n", command); fprintf(stderr, "%s", msg); @@ -1341,7 +1344,7 @@ static void do_the_tests( int npme_fixed, /* If >= -1, test fixed number of PME * nodes only */ const char *npmevalues_opt, /* Which -npme values should be tested */ - t_perf **perfdata, /* Here the performace data is stored */ + t_perf **perfdata, /* Here the performance data is stored */ int *pmeentries, /* Entries in the nPMEnodes list */ int repeats, /* Repeat each test this often */ int nnodes, /* Total number of nodes = nPP + nPME */ @@ -1354,7 +1357,8 @@ static void do_the_tests( const t_filenm *fnm, /* List of filenames from command line */ int nfile, /* Number of files specified on the cmdl. 
*/ int presteps, /* DLB equilibration steps, is checked */ - gmx_int64_t cpt_steps) /* Time step counter in the checkpoint */ + gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */ + gmx_bool bCheck) /* Check whether benchmark mdrun works */ { int i, nr, k, ret, count = 0, totaltests; int *nPMEnodes = NULL; @@ -1454,8 +1458,11 @@ static void do_the_tests( cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench); /* To prevent that all benchmarks fail due to a show-stopper argument - * on the mdrun command line, we make a quick check first */ - if (bFirst) + * on the mdrun command line, we make a quick check first. + * This check can be turned off in cases where the automatically chosen + * number of PME-only ranks leads to a number of PP ranks for which no + * decomposition can be found (e.g. for large prime numbers) */ + if (bFirst && bCheck) { make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile); } @@ -2016,6 +2023,9 @@ int gmx_tune_pme(int argc, char *argv[]) "need to provide a machine- or hostfile. This can also be passed", "via the MPIRUN variable, e.g.[PAR]", "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]", + "Before doing the actual benchmark runs, [THISMODULE] will do a quick", + "check whether mdrun works as expected with the provided parallel settings", + "if the [TT]-check[tt] option is activated (the default).", "Please call [THISMODULE] with the normal options you would pass to", "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the", "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]", @@ -2025,7 +2035,7 @@ int gmx_tune_pme(int argc, char *argv[]) "written with enlarged cutoffs and smaller Fourier grids respectively.", "Typically, the first test (number 0) will be with the settings from the input", "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff", - "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ", + "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ", "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ", "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier " "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1", @@ -2035,7 +2045,7 @@ int gmx_tune_pme(int argc, char *argv[]) "MD systems. The dynamic load balancing needs about 100 time steps", "to adapt to local load imbalances, therefore the time step counters", "are by default reset after 100 steps. For large systems (>1M atoms), as well as ", - "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.", + "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.", "From the 'DD' load imbalance entries in the md.log output file you", "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]" "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]", @@ -2162,6 +2172,7 @@ int gmx_tune_pme(int argc, char *argv[]) gmx_bool bKeepAndNumCPT = FALSE; gmx_bool bResetCountersHalfWay = FALSE; gmx_bool bBenchmark = TRUE; + gmx_bool bCheck = TRUE; output_env_t oenv = NULL; @@ -2205,6 +2216,8 @@ int gmx_tune_pme(int argc, char *argv[]) "Launch the real simulation after optimization" }, { "-bench", FALSE, etBOOL, {&bBenchmark}, "Run the benchmarks or just create the input [TT].tpr[tt] files?" 
}, + { "-check", FALSE, etBOOL, {&bCheck}, + "Before the benchmark runs, check whether mdrun works in parallel" }, /******************/ /* mdrun options: */ /******************/ @@ -2457,7 +2470,7 @@ int gmx_tune_pme(int argc, char *argv[]) { do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries, repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, - cmd_args_bench, fnm, NFILE, presteps, cpt_steps); + cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck); fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0); diff --git a/src/gromacs/gmxlib/calcgrid.c b/src/gromacs/gmxlib/calcgrid.c index 1de1f57efe..7bbee76f12 100644 --- a/src/gromacs/gmxlib/calcgrid.c +++ b/src/gromacs/gmxlib/calcgrid.c @@ -136,7 +136,7 @@ real calc_grid(FILE *fp, matrix box, real gr_sp, /* Determine how many pre-factors of 2 we need */ fac2 = 1; i = g_baseNR - 1; - while (fac2*grid_base[i-1] < nmin) + while (fac2*grid_base[i] < nmin) { fac2 *= 2; } diff --git a/src/gromacs/gmxpreprocess/toppush.c b/src/gromacs/gmxpreprocess/toppush.c index 161940b72e..8272b7787e 100644 --- a/src/gromacs/gmxpreprocess/toppush.c +++ b/src/gromacs/gmxpreprocess/toppush.c @@ -599,9 +599,11 @@ static void push_bondtype(t_params * bt, { sprintf(errbuf, "Overriding %s parameters.%s", interaction_function[ftype].longname, - (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : ""); + (ftype == F_PDIHS) ? + "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other." + : ""); warning(wi, errbuf); - fprintf(stderr, " old:"); + fprintf(stderr, " old: "); for (j = 0; (j < nrfp); j++) { fprintf(stderr, " %g", bt->param[i].c[j]); diff --git a/src/gromacs/legacyheaders/domdec.h b/src/gromacs/legacyheaders/domdec.h index 13b55711e8..c44c7bf840 100644 --- a/src/gromacs/legacyheaders/domdec.h +++ b/src/gromacs/legacyheaders/domdec.h @@ -139,6 +139,12 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr); * possible after subsequently setting a shorter cut-off with change_dd_cutoff. */ +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd); +/* Return if the DLB lock is set */ + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue); +/* Set a lock such that with DLB=auto DLB can (not) get turned on */ + void dd_setup_dlb_resource_sharing(t_commrec *cr, const gmx_hw_info_t *hwinfo, const gmx_hw_opt_t *hw_opt); diff --git a/src/gromacs/listed-forces/bonded.cpp b/src/gromacs/listed-forces/bonded.cpp index b1ded380a4..dfb846d9e7 100644 --- a/src/gromacs/listed-forces/bonded.cpp +++ b/src/gromacs/listed-forces/bonded.cpp @@ -2117,6 +2117,152 @@ pdihs_noener_simd(int nbonds, } } +/* This is mostly a copy of pdihs_noener_simd above, but with using + * the RB potential instead of a harmonic potential. + * This function can replace rbdihs() when no energy and virial are needed. 
+ */ +static void +rbdihs_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + const int nfa1 = 5; + int i, iu, s, j; + int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH]; + real dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr; + real buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf; + real *parm, *p, *q; + + gmx_simd_real_t phi_S; + gmx_simd_real_t ddphi_S, cosfac_S; + gmx_simd_real_t mx_S, my_S, mz_S; + gmx_simd_real_t nx_S, ny_S, nz_S; + gmx_simd_real_t nrkj_m2_S, nrkj_n2_S; + gmx_simd_real_t parm_S, c_S; + gmx_simd_real_t sin_S, cos_S; + gmx_simd_real_t sf_i_S, msf_l_S; + pbc_simd_t pbc_simd; + + gmx_simd_real_t pi_S = gmx_simd_set1_r(M_PI); + gmx_simd_real_t one_S = gmx_simd_set1_r(1.0); + + /* Ensure SIMD register alignment */ + dr = gmx_simd_align_r(dr_array); + buf = gmx_simd_align_r(buf_array); + + /* Extract aligned pointers for parameters and variables */ + parm = buf; + p = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH; + q = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH; + + set_pbc_simd(pbc, &pbc_simd); + + /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */ + for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1) + { + /* Collect atom quadruplets for GMX_SIMD_REAL_WIDTH dihedrals. + * iu indexes into forceatoms; we should not let iu go beyond nbonds. + */ + iu = i; + for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++) + { + type = forceatoms[iu]; + ai[s] = forceatoms[iu+1]; + aj[s] = forceatoms[iu+2]; + ak[s] = forceatoms[iu+3]; + al[s] = forceatoms[iu+4]; + + /* We don't need the first parameter, since that's a constant + * which only affects the energies, not the forces. + */ + for (j = 1; j < NR_RBDIHS; j++) + { + parm[j*GMX_SIMD_REAL_WIDTH + s] = + forceparams[type].rbdihs.rbcA[j]; + } + + /* At the end fill the arrays with identical entries */ + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + + /* Calculate GMX_SIMD_REAL_WIDTH dihedral angles at once */ + dih_angle_simd(x, ai, aj, ak, al, &pbc_simd, + dr, + &phi_S, + &mx_S, &my_S, &mz_S, + &nx_S, &ny_S, &nz_S, + &nrkj_m2_S, + &nrkj_n2_S, + p, q); + + /* Change to polymer convention */ + phi_S = gmx_simd_sub_r(phi_S, pi_S); + + gmx_simd_sincos_r(phi_S, &sin_S, &cos_S); + + ddphi_S = gmx_simd_setzero_r(); + c_S = one_S; + cosfac_S = one_S; + for (j = 1; j < NR_RBDIHS; j++) + { + parm_S = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH); + ddphi_S = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S); + cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S); + c_S = gmx_simd_add_r(c_S, one_S); + } + + /* Note that here we do not use the minus sign which is present + * in the normal RB code. This is corrected for through (m)sf below. 
+ */ + ddphi_S = gmx_simd_mul_r(ddphi_S, sin_S); + + sf_i_S = gmx_simd_mul_r(ddphi_S, nrkj_m2_S); + msf_l_S = gmx_simd_mul_r(ddphi_S, nrkj_n2_S); + + /* After this m?_S will contain f[i] */ + mx_S = gmx_simd_mul_r(sf_i_S, mx_S); + my_S = gmx_simd_mul_r(sf_i_S, my_S); + mz_S = gmx_simd_mul_r(sf_i_S, mz_S); + + /* After this n?_S will contain -f[l] */ + nx_S = gmx_simd_mul_r(msf_l_S, nx_S); + ny_S = gmx_simd_mul_r(msf_l_S, ny_S); + nz_S = gmx_simd_mul_r(msf_l_S, nz_S); + + gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S); + gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S); + gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S); + gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S); + gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S); + gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S); + + iu = i; + s = 0; + do + { + do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s], + p[s], q[s], + dr[ XX *GMX_SIMD_REAL_WIDTH+s], + dr[ YY *GMX_SIMD_REAL_WIDTH+s], + dr[ ZZ *GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s], + f); + s++; + iu += nfa1; + } + while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds); + } +} + #endif /* GMX_SIMD_HAVE_REAL */ @@ -4405,6 +4551,19 @@ static real calc_one_bond(int thread, global_atom_index); v = 0; } +#ifdef GMX_SIMD_HAVE_REAL + else if (ftype == F_RBDIHS && + !bCalcEnerVir && fr->efep == efepNO) + { + /* No energies, shift forces, dvdl */ + rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0, + idef->iparams, + (const rvec*)x, f, + pbc, g, lambda[efptFTYPE], md, fcd, + global_atom_index); + v = 0; + } +#endif else { v = interaction_function[ftype].ifunc(nbn, iatoms+nb0, diff --git a/src/gromacs/mdlib/domdec.cpp b/src/gromacs/mdlib/domdec.cpp index 13934d5fa6..06f2602916 100644 --- a/src/gromacs/mdlib/domdec.cpp +++ b/src/gromacs/mdlib/domdec.cpp @@ -275,6 +275,8 @@ typedef struct gmx_domdec_comm /* The DLB option */ int eDLB; + /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */ + gmx_bool bDLB_locked; /* Are we actually using DLB? */ gmx_bool bDynLoadBal; @@ -391,9 +393,9 @@ typedef struct gmx_domdec_comm int eFlop; double flop; int flop_n; - /* Have often have did we have load measurements */ + /* How many times did we have load measurements */ int n_load_have; - /* Have often have we collected the load measurements */ + /* How many times have we collected the load measurements */ int n_load_collect; /* Statistics */ @@ -3465,7 +3467,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, cell_size[i] = 1.0/ncd; } } - else if (dd_load_count(comm)) + else if (dd_load_count(comm) > 0) { load_aver = comm->load[d].sum_m/ncd; change_max = 0; @@ -4336,7 +4338,7 @@ static void clear_and_mark_ind(int ncg, int *move, static void print_cg_move(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step, int cg, int dim, int dir, - gmx_bool bHaveLimitdAndCMOld, real limitd, + gmx_bool bHaveCgcmOld, real limitd, rvec cm_old, rvec cm_new, real pos_d) { gmx_domdec_comm_t *comm; char buf[22]; comm = dd->comm; fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf)); - if (bHaveLimitdAndCMOld) + if (limitd > 0) { - fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + dd->comm->bCGs ? 
"The charge group starting at atom" : "Atom", ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim)); } else { - fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n", + /* We don't have a limiting distance available: don't print it */ + fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n", + dd->comm->bCGs ? "The charge group starting at atom" : "Atom", ddglatnr(dd, dd->cgindex[cg]), dim2char(dim)); } fprintf(fplog, "distance out of cell %f\n", dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]); - if (bHaveLimitdAndCMOld) + if (bHaveCgcmOld) { fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n", cm_old[XX], cm_old[YY], cm_old[ZZ]); @@ -4375,19 +4380,20 @@ static void print_cg_move(FILE *fplog, static void cg_move_error(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step, int cg, int dim, int dir, - gmx_bool bHaveLimitdAndCMOld, real limitd, + gmx_bool bHaveCgcmOld, real limitd, rvec cm_old, rvec cm_new, real pos_d) { if (fplog) { print_cg_move(fplog, dd, step, cg, dim, dir, - bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + bHaveCgcmOld, limitd, cm_old, cm_new, pos_d); } print_cg_move(stderr, dd, step, cg, dim, dir, - bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + bHaveCgcmOld, limitd, cm_old, cm_new, pos_d); gmx_fatal(FARGS, - "A charge group moved too far between two domain decomposition steps\n" - "This usually means that your system is not well equilibrated"); + "%s moved too far between two domain decomposition steps\n" + "This usually means that your system is not well equilibrated", + dd->comm->bCGs ? "A charge group" : "An atom"); } static void rotate_state_atom(t_state *state, int a) @@ -4509,7 +4515,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step, { if (pos_d >= limit1[d]) { - cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d], + cg_move_error(fplog, dd, step, cg, d, 1, + cg_cm != state->x, limitd[d], cg_cm[cg], cm_new, pos_d); } dev[d] = 1; @@ -4535,7 +4542,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step, { if (pos_d < limit0[d]) { - cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d], + cg_move_error(fplog, dd, step, cg, d, -1, + cg_cm != state->x, limitd[d], cg_cm[cg], cm_new, pos_d); } dev[d] = -1; @@ -4949,7 +4957,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step, { cg_move_error(fplog, dd, step, cg, dim, (flag & DD_FLAG_FW(d)) ? 1 : 0, - FALSE, 0, + fr->cutoff_scheme == ecutsGROUP, 0, comm->vbuf.v[buf_pos], comm->vbuf.v[buf_pos], comm->vbuf.v[buf_pos][dim]); @@ -6673,7 +6681,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, /* Initialize to GPU share count to 0, might change later */ comm->nrank_gpu_shared = 0; - comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->bDLB_locked = FALSE; comm->bDynLoadBal = (comm->eDLB == edlbYES); if (fplog) @@ -6739,6 +6748,13 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, comm->cellsize_limit = 0; comm->bBondComm = FALSE; + /* Atoms should be able to move by up to half the list buffer size (if > 0) + * within nstlist steps. Since boundaries are allowed to displace by half + * a cell size, DD cells should be at least the size of the list buffer. 
+ */ + comm->cellsize_limit = std::max(comm->cellsize_limit, + ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb)); + if (comm->bInterCGBondeds) { if (comm_distance_min > 0) @@ -7564,6 +7580,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr) comm->PMELoadBal_max_cutoff = comm->cutoff; } +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd) +{ + return dd->comm->bDLB_locked; +} + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue) +{ + /* We can only lock the DLB when it is set to auto; otherwise we don't lock */ + if (dd->comm->eDLB == edlbAUTO) + { + dd->comm->bDLB_locked = bValue; + } +} + static void merge_cg_buffers(int ncell, gmx_domdec_comm_dim_t *cd, int pulse, int *ncg_cell, @@ -9324,17 +9354,17 @@ void dd_partition_system(FILE *fplog, } /* Check if we have recorded loads on the nodes */ - if (comm->bRecordLoad && dd_load_count(comm)) + if (comm->bRecordLoad && dd_load_count(comm) > 0) { - if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal) + if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd)) { /* Check if we should use DLB at the second partitioning * and every 100 partitionings, * so the extra communication cost is negligible. */ - n = std::max(100, nstglobalcomm); + const int nddp_chk_dlb = 100; bCheckDLB = (comm->n_load_collect == 0 || - comm->n_load_have % n == n-1); + comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1); } else { @@ -9372,8 +9402,26 @@ void dd_partition_system(FILE *fplog, /* Since the timings are node dependent, the master decides */ if (DDMASTER(dd)) { - bTurnOnDLB = - (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + /* Here we check if the max PME rank load is more than 0.98 times + * the max PP force load. If so, PP DLB will not help, + * since we are (almost) limited by PME. Furthermore, + * DLB will cause a significant extra x/f redistribution + * cost on the PME ranks, which will then surely result + * in lower total performance. + * This check might be fragile, since one measurement + * below 0.98 (although only done once every 100 DD part.) + * could turn on DLB for the rest of the run. 
+ */ + if (cr->npmenodes > 0 && + dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON) + { + bTurnOnDLB = FALSE; + } + else + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + } if (debug) { fprintf(debug, "step %s, imb loss %f\n", diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c index f98b073fee..4313d3e75e 100644 --- a/src/gromacs/mdlib/force.c +++ b/src/gromacs/mdlib/force.c @@ -118,9 +118,11 @@ static void reduce_thread_forces(int n, rvec *f, int nthreads, f_thread_t *f_t) { int t, i; + int nthreads_loop gmx_unused; /* This reduction can run over any number of threads */ -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static) + nthreads_loop = gmx_omp_nthreads_get(emntBonded); +#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static) for (i = 0; i < n; i++) { for (t = 1; t < nthreads; t++) diff --git a/src/gromacs/mdlib/mdatom.c b/src/gromacs/mdlib/mdatom.c index 01c5c55bd6..d1722dedf1 100644 --- a/src/gromacs/mdlib/mdatom.c +++ b/src/gromacs/mdlib/mdatom.c @@ -118,6 +118,7 @@ void atoms2md(gmx_mtop_t *mtop, t_inputrec *ir, t_grpopts *opts; gmx_groups_t *groups; gmx_molblock_t *molblock; + int nthreads gmx_unused; bLJPME = EVDW_PME(ir->vdwtype); @@ -230,7 +231,8 @@ void atoms2md(gmx_mtop_t *mtop, t_inputrec *ir, alook = gmx_mtop_atomlookup_init(mtop); -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntDefault); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (i = 0; i < md->nr; i++) { int g, ag, molb; diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c index 234bcbdd8f..5255d544d1 100644 --- a/src/gromacs/mdlib/minimize.c +++ b/src/gromacs/mdlib/minimize.c @@ -550,6 +550,7 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, int start, end; rvec *x1, *x2; real dvdl_constr; + int nthreads gmx_unused; s1 = &ems1->s; s2 = &ems2->s; @@ -587,7 +588,8 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, x1 = s1->x; x2 = s2->x; -#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate)) + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) { int gf, i, m; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 799425edab..c024f7b02c 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -212,7 +212,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int eeltype, nbnxn_cu_kfunc_ptr_t res; assert(eeltype < eelCuNR); - assert(evdwtype < eelCuNR); + assert(evdwtype < evdwCuNR); if (bDoEne) { diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh index c468306899..3b37869e6d 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh @@ -157,7 +157,7 @@ /* Analytical Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_ANA -#define LJ_CUTOFF_CHECK +#define VDW_CUTOFF_CHECK /* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__ @@ -189,7 +189,7 @@ #undef NB_KERNEL_FUNC_NAME #undef EL_EWALD_ANA -#undef LJ_CUTOFF_CHECK +#undef VDW_CUTOFF_CHECK /* Tabulated Ewald interaction kernels */ @@ -229,7 +229,7 @@ /* Tabulated Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_TAB -#define LJ_CUTOFF_CHECK +#define VDW_CUTOFF_CHECK /* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__ @@ -261,4 +261,4 @@ #undef NB_KERNEL_FUNC_NAME #undef EL_EWALD_TAB -#undef LJ_CUTOFF_CHECK +#undef VDW_CUTOFF_CHECK diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre index 9432b2f311..5e410cdecc 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre @@ -139,6 +139,7 @@ void nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -210,7 +211,8 @@ void gmx_incons("Unsupported VdW interaction type"); }} -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) {{ nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c index 3db03bbec5..4cad2a2f56 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c @@ -185,6 +185,7 @@ nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list, int coult; int vdwt; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -242,7 +243,8 @@ nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list, gmx_incons("Unsupported vdwtype in nbnxn reference kernel"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c index ed1456873b..1d8567b206 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c @@ -274,6 +274,7 @@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t gmx_unused *nbl_list, nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -345,7 +346,8 @@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t gmx_unused *nbl_list, gmx_incons("Unsupported VdW interaction type"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c index d17ba1e7bd..3531299efd 100644 --- 
a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c @@ -273,6 +273,7 @@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t gmx_unused *nbl_list, nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -344,7 +345,8 @@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t gmx_unused *nbl_list, gmx_incons("Unsupported VdW interaction type"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index f13de8c7db..56e616967c 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -1931,6 +1931,7 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs, float *bbcz; nbnxn_bb_t *bb; int ncd, sc; + int nthreads gmx_unused; grid = &nbs->grid[0]; @@ -1957,7 +1958,8 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs, bbcz = grid->bbcz_simple; bb = grid->bb_simple; -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntPairsearch); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (sc = 0; sc < grid->nc; sc++) { int c, tx, na; @@ -4473,6 +4475,7 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, { int nsci, ncj4, nexcl; int n, i; + int nthreads gmx_unused; if (nblc->bSimple) { @@ -4513,7 +4516,8 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, /* Each thread should copy its own data to the combined arrays, * as otherwise data will go back and forth between different caches. */ -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntPairsearch); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (n = 0; n < nnbl; n++) { int sci_offset; diff --git a/src/gromacs/mdlib/ns.c b/src/gromacs/mdlib/ns.c index 9145929c47..3a1396cd40 100644 --- a/src/gromacs/mdlib/ns.c +++ b/src/gromacs/mdlib/ns.c @@ -427,6 +427,14 @@ static gmx_inline void close_i_nblist(t_nblist *nlist) nlist->jindex[nri+1] = nlist->nrj; len = nlist->nrj - nlist->jindex[nri]; + /* If there are no j-particles we have to reduce the + * number of i-particles again, to prevent errors in the + * kernel functions. + */ + if ((len == 0) && (nlist->nri > 0)) + { + nlist->nri--; + } } } diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c index e6686abd02..b35dd827ac 100644 --- a/src/gromacs/mdlib/sim_util.c +++ b/src/gromacs/mdlib/sim_util.c @@ -1399,6 +1399,16 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, if (bDoForces && DOMAINDECOMP(cr)) { + if (bUseGPU) + { + /* We are done with the CPU compute, but the GPU local non-bonded + * kernel can still be running while we communicate the forces. + * We start a counter here, so we can, hopefully, time the rest + * of the GPU kernel execution and data transfer. 
+ */ + wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST); + } + /* Communicate the forces */ wallcycle_start(wcycle, ewcMOVEF); dd_move_f(cr->dd, f, fr->fshift); @@ -1429,13 +1439,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, /* wait for local forces (or calculate in emulation mode) */ if (bUseGPU) { + float cycles_tmp, cycles_wait_est; + const float cuda_api_overhead_margin = 50000.0f; /* cycles */ + wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); nbnxn_cuda_wait_gpu(nbv->cu_nbv, nbv->grp[eintLocal].nbat, flags, eatLocal, enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], fr->fshift); - cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); + cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); + + if (bDoForces && DOMAINDECOMP(cr)) + { + cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST); + + if (cycles_tmp < cuda_api_overhead_margin) + { + /* We measured few cycles, so it could be that the kernel + * and transfer finished earlier and there was no actual + * wait time, only API call overhead. + * Then the actual time could be anywhere between 0 and + * cycles_wait_est. As a compromise, we use half the time. + */ + cycles_wait_est *= 0.5f; + } + } + else + { + /* No force communication so we actually timed the wait */ + cycles_wait_est = cycles_tmp; + } + /* Even though this is after dd_move_f, the actual task we are + * waiting for runs asynchronously with dd_move_f and we usually + * have nothing to balance it with, so we can and should add + * the time to the force time for load balancing. + */ + cycles_force += cycles_wait_est; + cycles_wait_gpu += cycles_wait_est; /* now clear the GPU outputs while we finish the step on the CPU */ diff --git a/src/gromacs/mdlib/update.c b/src/gromacs/mdlib/update.c index 3be84a8a93..373212ca8a 100644 --- a/src/gromacs/mdlib/update.c +++ b/src/gromacs/mdlib/update.c @@ -1767,7 +1767,9 @@ void update_constraints(FILE *fplog, } else { -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static) + nth = gmx_omp_nthreads_get(emntUpdate); + +#pragma omp parallel for num_threads(nth) schedule(static) for (i = start; i < nrend; i++) { copy_rvec(upd->xp[i], state->x[i]); diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c index 27eb275cd0..3b0d906636 100644 --- a/src/gromacs/timing/wallcycle.c +++ b/src/gromacs/timing/wallcycle.c @@ -99,7 +99,7 @@ static const char *wcn[ewcNR] = "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.", "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec", - "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.", + "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.", "Vsite spread", "COM pull force", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test" @@ -388,6 +388,12 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) wcc = wc->wcc; + /* The GPU wait estimate counter is used for load balancing only + * and will mess up the total due to double counting: clear it. 
+ */ + wcc[ewcWAIT_GPU_NB_L_EST].n = 0; + wcc[ewcWAIT_GPU_NB_L_EST].c = 0; + for (i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME)) diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 104173598a..69c92cde72 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -54,7 +54,7 @@ enum { ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB, ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE, - ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, + ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS, ewcVSITESPREAD, ewcPULLPOT, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD, ewcTEST, ewcNR diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 9864f8789f..0504150c00 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -1881,6 +1881,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); + if (bPMETuneRunning && + use_GPU(fr->nbv) && DOMAINDECOMP(cr) && + !(cr->duty & DUTY_PME)) + { + /* Lock DLB=auto to off (does nothing when DLB=yes/no). + * With GPUs + separate PME ranks, we don't want DLB. + * DLB could otherwise get turned on while we scan coarse grids + * and would then never be turned off again. + * That would hurt performance at the final, optimal + * grid spacing, where DLB almost never helps. + * Also, DLB can limit the cut-off for PME tuning. + */ + dd_dlb_set_lock(cr->dd, TRUE); + } + if (bPMETuneRunning || step_rel > ir->nstlist*50) { bPMETuneTry = FALSE; @@ -1911,6 +1926,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], { calc_enervirdiff(NULL, ir->eDispCorr, fr); } + + if (!bPMETuneRunning && + DOMAINDECOMP(cr) && + dd_dlb_is_locked(cr->dd)) + { + /* Unlock DLB=auto; DLB is allowed to activate + * (but we don't expect it to activate in most cases). + */ + dd_dlb_set_lock(cr->dd, FALSE); + } } cycles_pmes = 0; } diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c index 5ba76e63e0..dd2cd1a2e9 100644 --- a/src/programs/mdrun/pme_loadbal.c +++ b/src/programs/mdrun/pme_loadbal.c @@ -265,6 +265,15 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb, while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok); set->rcut_coulomb = pme_lb->cut_spacing*sp; + if (set->rcut_coulomb < pme_lb->rcut_coulomb_start) + { + /* This is unlikely, but can happen when e.g. continuing from + * a checkpoint after equilibration where the box shrank a lot. + * We want to avoid rcoulomb getting smaller than rvdw + * and there might be more issues with decreasing rcoulomb. + */ + set->rcut_coulomb = pme_lb->rcut_coulomb_start; + } if (pme_lb->cutoff_scheme == ecutsVERLET) {
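
The calcgrid.c hunk above fixes an off-by-one: with i = g_baseNR - 1, the pre-factor scan must test the largest allowed base size grid_base[i], not grid_base[i-1], or fac2 can stop one doubling short. A self-contained sketch of the corrected search; the base table and nmin values here are hypothetical stand-ins for the real ones in calcgrid.c:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical table of allowed FFT grid sizes (the real one lives in calcgrid.c) */
        const int grid_base[] = { 6, 8, 10, 12, 16, 20, 24, 25, 28, 32 };
        const int g_baseNR    = (int)(sizeof(grid_base)/sizeof(grid_base[0]));
        const int nmin        = 50; /* smallest acceptable grid dimension */
        int       fac2        = 1;
        int       i           = g_baseNR - 1;

        /* Double the pre-factor until the largest base entry reaches nmin;
         * testing grid_base[i-1] here (as the old code did) could stop too early. */
        while (fac2*grid_base[i] < nmin)
        {
            fac2 *= 2;
        }
        printf("fac2 = %d, largest reachable size = %d\n", fac2, fac2*grid_base[i]);
        return 0;
    }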
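
The loop at the heart of the new rbdihs_noener_simd() accumulates the derivative of the Ryckaert-Bellemans potential V(psi) = sum_n C_n cos^n(psi): each pass adds n*C_n*cos^(n-1)(psi), and the sum is then multiplied by sin(psi), with the usual minus sign deferred to the force prefactors. A scalar sketch of that recurrence; the coefficients are made up (the SIMD code loads rbcA from the topology):

    #include <math.h>
    #include <stdio.h>

    #define NR_RBDIHS 6

    /* Mirrors ddphi_S in the SIMD kernel: returns -dV/dpsi before the sign
     * correction; parm[0] only shifts the energy and never enters the force. */
    static double rb_ddphi(const double parm[NR_RBDIHS], double psi)
    {
        double ddphi  = 0.0;
        double c      = 1.0; /* running factor n       */
        double cosfac = 1.0; /* running cos^(n-1)(psi) */
        int    j;

        for (j = 1; j < NR_RBDIHS; j++)
        {
            ddphi  += c*parm[j]*cosfac;
            cosfac *= cos(psi);
            c      += 1.0;
        }
        return ddphi*sin(psi);
    }

    int main(void)
    {
        /* hypothetical RB coefficients in kJ/mol */
        const double parm[NR_RBDIHS] = { 9.28, 12.16, -13.12, -3.06, 26.24, -31.5 };

        printf("ddphi at psi=0.5: %g\n", rb_ddphi(parm, 0.5));
        return 0;
    }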
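
The new lower bound on the DD cell size in init_domain_decomposition() is plain arithmetic: the pair-list buffer is rlistlong minus the longer of the two interaction cut-offs, and cells must not be smaller than that buffer. A tiny numerical sketch with illustrative values, named after the t_inputrec fields used in the hunk:

    #include <stdio.h>

    static double max_d(double a, double b) { return a > b ? a : b; }

    int main(void)
    {
        const double rlistlong = 1.2; /* pair-list cut-off incl. buffer, nm */
        const double rvdw      = 1.0; /* VdW cut-off, nm                    */
        const double rcoulomb  = 1.0; /* Coulomb cut-off, nm                */
        double       cellsize_limit = 0.0;

        /* cells must be at least as large as the list buffer */
        cellsize_limit = max_d(cellsize_limit,
                               rlistlong - max_d(rvdw, rcoulomb));
        printf("cellsize_limit = %g nm\n", cellsize_limit); /* prints 0.2 */
        return 0;
    }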
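
The dd_dlb_is_locked()/dd_dlb_set_lock() pair added in domdec.h/domdec.cpp and driven from md.cpp forms a small protocol: the lock only has an effect under DLB=auto, PME tuning takes it while scanning grids, and it is released when tuning finishes. A minimal self-contained sketch; the struct is a hypothetical reduction of gmx_domdec_t:

    #include <stdbool.h>
    #include <stdio.h>

    enum dlb_mode { DLB_AUTO, DLB_YES, DLB_NO };

    struct domdec {
        enum dlb_mode mode;
        bool          dlb_locked;
    };

    static bool dd_dlb_is_locked(const struct domdec *dd) { return dd->dlb_locked; }

    static void dd_dlb_set_lock(struct domdec *dd, bool value)
    {
        /* locking is only meaningful with DLB=auto; yes/no are never overridden */
        if (dd->mode == DLB_AUTO)
        {
            dd->dlb_locked = value;
        }
    }

    int main(void)
    {
        struct domdec dd = { DLB_AUTO, false };

        dd_dlb_set_lock(&dd, true);  /* PME tuning starts: keep DLB off     */
        printf("locked while tuning: %d\n", dd_dlb_is_locked(&dd));
        dd_dlb_set_lock(&dd, false); /* tuning done: DLB may activate again */
        printf("locked after tuning: %d\n", dd_dlb_is_locked(&dd));
        return 0;
    }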
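
Several hunks above (force.c, mdatom.c, minimize.c, nbnxn_search.c, update.c and the nbnxn kernels) apply the same mechanical change: the gmx_omp_nthreads_get() call is hoisted out of the OpenMP pragma into a variable, apparently because some compilers mishandle a function call inside a num_threads() clause, and the variable carries gmx_unused so builds without OpenMP stay warning-free. A compilable sketch of the pattern; get_nthreads() is a hypothetical stand-in and GMX_UNUSED plays the role of the gmx_unused alias:

    #include <stdio.h>

    #ifdef __GNUC__
    #define GMX_UNUSED __attribute__((unused))
    #else
    #define GMX_UNUSED
    #endif

    static int get_nthreads(void) { return 4; }

    int main(void)
    {
        int sum = 0;
        int i;
        /* without OpenMP the pragma is ignored and nthreads goes unused;
         * the attribute keeps the compiler quiet about that */
        int nthreads GMX_UNUSED;

        nthreads = get_nthreads();
    #pragma omp parallel for num_threads(nthreads) reduction(+:sum) schedule(static)
        for (i = 0; i < 1000; i++)
        {
            sum += i;
        }
        printf("sum = %d\n", sum);
        return 0;
    }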
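
The ns.c hunk keeps the group-scheme kernels from ever seeing an i-entry with an empty j-range: if the entry being closed collected no j-particles, it is dropped again. A reduced sketch; this t_nblist is a hypothetical subset of the real structure:

    #include <stdio.h>

    typedef struct {
        int nri;        /* index of the current (last) i-entry    */
        int nrj;        /* total number of j-entries added so far */
        int jindex[16]; /* j-range boundaries, one per i-entry    */
    } t_nblist;

    static void close_i_nblist(t_nblist *nlist)
    {
        int nri = nlist->nri;
        int len;

        nlist->jindex[nri+1] = nlist->nrj;
        len = nlist->nrj - nlist->jindex[nri];

        /* If there are no j-particles we have to reduce the number of
         * i-particles again, to prevent errors in the kernel functions. */
        if (len == 0 && nlist->nri > 0)
        {
            nlist->nri--;
        }
    }

    int main(void)
    {
        t_nblist nlist = { 1, 3, { 0, 3 } }; /* current i-entry got no j's */

        close_i_nblist(&nlist);
        printf("nri after close: %d\n", nlist.nri); /* prints 0 */
        return 0;
    }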
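
The sim_util.c change estimates how long the local GPU kernel really kept the CPU waiting when the wait overlaps force communication: if the wait counter measured less than a rough CUDA API overhead margin, the true wait is only known to lie in [0, estimate], so half the estimate is used as a compromise. The decision logic, reduced to a sketch under those assumptions (the 50000-cycle margin is the constant from the patch; the function name is illustrative):

    #include <stdio.h>

    static float estimate_gpu_wait_cycles(float wait_counter_cycles,
                                          float comm_plus_wait_cycles,
                                          int   overlapped_with_comm)
    {
        const float api_overhead_margin = 50000.0f; /* cycles, as in the patch */

        if (!overlapped_with_comm)
        {
            /* no force communication, so the wait counter timed the real wait */
            return wait_counter_cycles;
        }
        if (wait_counter_cycles < api_overhead_margin)
        {
            /* likely only API overhead was measured; the true wait lies
             * somewhere in [0, comm_plus_wait_cycles], take the midpoint */
            return 0.5f*comm_plus_wait_cycles;
        }
        return comm_plus_wait_cycles;
    }

    int main(void)
    {
        printf("%g\n", estimate_gpu_wait_cycles(1000.0f, 200000.0f, 1));
        return 0;
    }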