From: Erik Lindahl
Date: Thu, 9 Jul 2015 10:56:19 +0000 (+0200)
Subject: Merge "Merge release-5-1 into master"
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=be4c77e5c2c4a1512110217d0a3386b6945d11e7;hp=b7cb5f5232b4edb2118b2ba7bd78ef35adebad04;p=alexxy%2Fgromacs.git

Merge "Merge release-5-1 into master"
---

diff --git a/cmake/FindPythonModule.cmake b/cmake/FindPythonModule.cmake
index b8e965d1e6..02773c643f 100644
--- a/cmake/FindPythonModule.cmake
+++ b/cmake/FindPythonModule.cmake
@@ -32,29 +32,48 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-# Adapted from code posted on cmake-users by Mark Moll
+# Adapted from code posted on cmake-users by Mark Moll (the execute_process()
+# call remains, but other things have been rewritten for nicer behavior).
 find_package(PythonInterp)
-function(find_python_module module)
-    string(TOUPPER ${module} module_upper)
-    if(NOT PYTHONMODULE_${module_upper})
-        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
-            set(${module}_FIND_REQUIRED TRUE)
-        endif()
-        if (NOT PYTHON_EXECUTABLE)
-            message(STATUS "Cannot find python module ${module} because no python executable is known")
-        else()
-            # A module's location is usually a directory, but for binary modules
-            # it's a .so file.
-            execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-                "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
-                RESULT_VARIABLE _${module}_status
-                OUTPUT_VARIABLE _${module}_location
-                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+function (find_python_module module)
+    string(TOUPPER ${module} _module_upper)
+    set(_find_package_module ${module})
+    set(_out_var PYTHONMODULE_${_module_upper})
+
+    include(CMakeParseArguments)
+    set(_options QUIET REQUIRED)
+    cmake_parse_arguments(ARG "${_options}" "" "" ${ARGN})
+    if (ARG_UNPARSED_ARGUMENTS)
+        message(FATAL_ERROR "Unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
+    endif()
+    if (ARG_REQUIRED)
+        set(${_find_package_module}_FIND_REQUIRED TRUE)
+    endif()
+    if (ARG_QUIET)
+        set(${_find_package_module}_FIND_QUIETLY TRUE)
+    endif()
+
+    if (NOT ${_out_var})
+        set(_status 1)
+        if (PYTHON_EXECUTABLE)
+            # A module's location is usually a directory, but for binary modules
+            # it's a .so file.
+            execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+                "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
+                RESULT_VARIABLE _status
+                OUTPUT_VARIABLE _location
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+        endif()
+        if(_status)
+            set(_location ${_find_package_module}-NOTFOUND)
         endif()
-        if(NOT _${module}_status)
-            set(PYTHONMODULE_${module_upper} ${_${module}_location} CACHE STRING
-                "Location of Python module ${module}")
-        endif()
+        set(${_out_var} ${_location} CACHE STRING
+            "Location of Python module ${module}" FORCE)
+        mark_as_advanced(${_out_var})
     endif()
-    find_package_handle_standard_args(PYTHONMODULE_${module} DEFAULT_MSG PYTHONMODULE_${module_upper})
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(
+        ${_find_package_module} DEFAULT_MSG
+        ${_out_var} PYTHON_EXECUTABLE)
 endfunction()
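[Editorial note: a minimal usage sketch of the rewritten helper, not part of the patch. The module name numpy is an arbitrary example; the cache variable follows the PYTHONMODULE_<MODULE> convention set above, and the QUIET/REQUIRED options are the ones parsed by cmake_parse_arguments.]

    include(FindPythonModule)
    # Soft dependency: sets PYTHONMODULE_NUMPY, or leaves it at <module>-NOTFOUND
    find_python_module(numpy QUIET)
    if (PYTHONMODULE_NUMPY)
        message(STATUS "numpy found at ${PYTHONMODULE_NUMPY}")
    endif()
    # Hard dependency: configuration fails if the module is missing
    find_python_module(sphinx REQUIRED)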
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 46bf447abd..6eae513025 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -41,8 +41,7 @@ find_program(SPHINX_EXECUTABLE NAMES sphinx-build
 mark_as_advanced(SPHINX_EXECUTABLE)
 
 # Detect Sphinx version
-
-if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
+if (SPHINX_EXECUTABLE AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
     execute_process(
         COMMAND ${SPHINX_EXECUTABLE} --version
         OUTPUT_VARIABLE SPHINX_VERSION_OUTPUT_VARIABLE
@@ -50,14 +49,17 @@ if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
-    string(REGEX REPLACE "Sphinx \\(${SPHINX_EXECUTABLE}\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
+    string(REGEX REPLACE "Sphinx \\([^)]*\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
     set(SPHINX_EXECUTABLE_VERSION "${SPHINX_EXECUTABLE_VERSION}" CACHE INTERNAL "Version of ${SPHINX_EXECUTABLE}")
 endif()
 
+set(_find_deps_options)
+if (Sphinx_FIND_QUIETLY)
+    set(_find_deps_options QUIET)
+endif()
 include(FindPythonModule)
-find_python_module(pygments)
-
-if(PYTHONMODULE_PYGMENTS)
+find_python_module(pygments ${_find_deps_options})
+if (PYTHONMODULE_PYGMENTS)
     set(Sphinx_pygments_FOUND 1)
 endif()
 
diff --git a/cmake/gmxVersionInfo.cmake b/cmake/gmxVersionInfo.cmake
index 162b32818e..f7a412b281 100644
--- a/cmake/gmxVersionInfo.cmake
+++ b/cmake/gmxVersionInfo.cmake
@@ -253,7 +253,7 @@ set(REGRESSIONTEST_BRANCH "refs/heads/master")
 # each release. It's hard to test because it is only used for
 # REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
 # been placed on the server.
-set(REGRESSIONTEST_MD5SUM "6f8531a6e3c2a8912327b9cd450d8745" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "bb67f145095249e9d4a93227fc4c352e" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
 
 math(EXPR GMX_VERSION_NUMERIC
      "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_MINOR}*100 + ${GMX_VERSION_PATCH}")
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index d3f190a635..17a7681c8b 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -59,7 +59,7 @@ mark_as_advanced(SOURCE_MD5SUM)
 set(EXPECTED_DOXYGEN_VERSION 1.8.5)
 
 find_package(PythonInterp)
-find_package(Sphinx 1.2.3 COMPONENTS pygments)
+find_package(Sphinx 1.2.3 QUIET COMPONENTS pygments)
 
 # Even if we aren't going to make the full webpage, set up to put all
 # the documentation output in the same place, for convenience
@@ -250,6 +250,10 @@ else()
         COMMAND ${CMAKE_COMMAND} -E echo
             "HTML pages cannot be built because Sphinx is not available"
         VERBATIM)
+    add_custom_target(install-guide
+        COMMAND ${CMAKE_COMMAND} -E echo
+            "INSTALL cannot be built because Sphinx is not available"
+        VERBATIM)
     add_custom_target(man
         COMMAND ${CMAKE_COMMAND} -E echo
             "man pages cannot be built because Sphinx is not available"
diff --git a/docs/user-guide/mdp-options.rst b/docs/user-guide/mdp-options.rst
index 5767bd4f99..4fb9c0b204 100644
--- a/docs/user-guide/mdp-options.rst
+++ b/docs/user-guide/mdp-options.rst
@@ -1695,16 +1695,6 @@ applicable pulling coordinate.
    system, *e.g.* a water slab (see Engin et al.
    J. Chem. Phys. B 2010).
 
-.. mdp:: pull-coord1-groups
-
-   The two groups indices should be given on which this pull
-   coordinate will operate. The first index can be 0, in which case an
-   absolute reference of :mdp:`pull-coord1-origin` is used. With an
-   absolute reference the system is no longer translation invariant
-   and one should think about what to do with the center of mass
-   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
-   :mdp-value:`direction-relative` four groups are required.
-
 .. mdp:: pull-coord1-type:
 
    .. mdp-value:: umbrella
@@ -1780,6 +1770,16 @@ applicable pulling coordinate.
       component. This geometry is not supported with constraint
      pulling.
 
+.. mdp:: pull-coord1-groups
+
+   The two groups indices should be given on which this pull
+   coordinate will operate. The first index can be 0, in which case an
+   absolute reference of :mdp:`pull-coord1-origin` is used. With an
+   absolute reference the system is no longer translation invariant
+   and one should think about what to do with the center of mass
+   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
+   :mdp-value:`direction-relative` four groups are required.
+
 .. mdp:: pull-coord1-dim
 
    (Y Y Y)
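[Editorial note: a sketch of the resulting option order in an .mdp file, not part of the patch; group names and values are illustrative. The point of the move, mirrored by the parser reordering in src/gromacs/gmxpreprocess/readpull.c near the end of this patch, is that the geometry determines how many indices pull-coord1-groups needs.]

    pull                 = yes
    pull-ngroups         = 2
    pull-ncoords         = 1
    pull-group1-name     = Protein
    pull-group2-name     = Ligand
    pull-coord1-type     = umbrella
    pull-coord1-geometry = distance  ; direction-relative would need four groups
    pull-coord1-groups   = 1 2
    pull-coord1-dim      = Y Y Y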
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index e8de945b94..3986ed0562 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -216,9 +216,20 @@ enum {
 };
 
 enum {
-    edlbAUTO, edlbNO, edlbYES, edlbNR
+    edlbsOffForever,           /* DLB is off and will never be turned on */
+    edlbsOffCanTurnOn,         /* DLB is off and will turn on with imbalance */
+    edlbsOffTemporarilyLocked, /* DLB is off and temporarily can not turn on */
+    edlbsOn,                   /* DLB is on and will stay on forever */
+    edlbsNR
 };
-const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
+/* Allowed DLB state transitions:
+ *   edlbsOffCanTurnOn         -> edlbsOn
+ *   edlbsOffCanTurnOn         -> edlbsOffForever
+ *   edlbsOffCanTurnOn         -> edlbsOffTemporarilyLocked
+ *   edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ */
+
+const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
 
 typedef struct
 {
@@ -297,14 +308,10 @@ typedef struct gmx_domdec_comm
     t_blocka *cglink;
     char     *bLocalCG;
 
-    /* The DLB option */
-    int      eDLB;
-    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
-    gmx_bool bDLB_locked;
-    /* With eDLB=edlbAUTO, should we check if to DLB on at the next DD? */
+    /* The DLB state, possible values are defined above */
+    int      dlbState;
+    /* With dlbState=edlbsOffCanTurnOn, should we check if to DLB on at the next DD? */
     gmx_bool bCheckWhetherToTurnDlbOn;
-    /* Are we actually using DLB? */
-    gmx_bool bDynLoadBal;
 
     /* Cell sizes for static load balancing, first index cartesian */
     real **slb_frac;
@@ -587,6 +594,11 @@ t_block *dd_charge_groups_global(gmx_domdec_t *dd)
     return &dd->comm->cgs_gl;
 }
 
+static bool dlbIsOn(const gmx_domdec_comm_t *comm)
+{
+    return (comm->dlbState == edlbsOn);
+}
+
 static void vec_rvec_init(vec_rvec_t *v)
 {
     v->nalloc = 0;
@@ -665,7 +677,7 @@ void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
         dim         = dd->dim[d];
         shift0[dim] = zones->izone[izone].shift0[dim];
         shift1[dim] = zones->izone[izone].shift1[dim];
-        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
+        if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
         {
             /* A conservative approach, this can be optimized */
             shift0[dim] -= 1;
@@ -3219,7 +3231,7 @@ static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
         }
     }
 
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
    {
         copy_rvec(cellsize_min, comm->cellsize_min);
     }
@@ -3815,7 +3827,7 @@ static void set_dd_cell_sizes(gmx_domdec_t *dd,
     copy_rvec(comm->cell_x0, comm->old_cell_x0);
     copy_rvec(comm->cell_x1, comm->old_cell_x1);
 
-    if (comm->bDynLoadBal)
+    if (dlbIsOn(comm))
     {
         if (DDMASTER(dd))
         {
@@ -3856,7 +3868,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
         /* Without PBC we don't have restrictions on the outer cells */
         if (!(dim >= ddbox->npbcdim &&
               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
-            comm->bDynLoadBal &&
+            dlbIsOn(comm) &&
             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
             comm->cellsize_min[dim])
         {
@@ -3870,11 +3882,11 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
         }
     }
 
-    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
+    if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
     {
         /* Communicate the boundaries and update cell_ns_x0/1 */
         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
-        if (dd->bGridJump && dd->ndim > 1)
+        if (dlbIsOn(dd->comm) && dd->ndim > 1)
         {
             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
         }
     }
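/* [Editorial sketch, not part of the patch: the transition table above,
 * written out as a hypothetical validity check. Only the auto state may
 * change; edlbsOn and edlbsOffForever are terminal.] */
static gmx_bool dlbTransitionIsAllowed(int from, int to)
{
    switch (from)
    {
        case edlbsOffCanTurnOn:
            /* may turn on, give up forever, or be locked temporarily */
            return (to == edlbsOn || to == edlbsOffForever ||
                    to == edlbsOffTemporarilyLocked);
        case edlbsOffTemporarilyLocked:
            /* unlocking returns to the auto state */
            return (to == edlbsOffCanTurnOn);
        default:
            return FALSE;
    }
}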
@@ -5000,7 +5012,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
                 /* Check which direction this cg should go */
                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                 {
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                     {
                         /* The cell boundaries for dimension d2 are not equal
                          * for each cell row of the lower dimension(s),
@@ -5299,7 +5311,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
         {
             load = &comm->load[d];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
             }
@@ -5308,7 +5320,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             {
                 sbuf[pos++] = dd_force_load(comm);
                 sbuf[pos++] = sbuf[0];
-                if (dd->bGridJump)
+                if (dlbIsOn(dd->comm))
                 {
                     sbuf[pos++] = sbuf[0];
                     sbuf[pos++] = cell_frac;
@@ -5328,7 +5340,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                 {
                     sbuf[pos++] = comm->load[d+1].sum;
                     sbuf[pos++] = comm->load[d+1].max;
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                     {
                         sbuf[pos++] = comm->load[d+1].sum_m;
                         sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
@@ -5357,7 +5369,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             if (dd->ci[dim] == dd->master_ci[dim])
             {
                 /* We are the root, process this row */
-                if (comm->bDynLoadBal)
+                if (dlbIsOn(comm))
                 {
                     root = comm->root[d];
                 }
@@ -5374,7 +5386,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                         load->sum += load->load[pos++];
                         load->max  = std::max(load->max, load->load[pos]);
                         pos++;
-                        if (dd->bGridJump)
+                        if (dlbIsOn(dd->comm))
                         {
                             if (root->bLimited)
                             {
@@ -5408,7 +5420,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                             pos++;
                         }
                     }
-                    if (comm->bDynLoadBal && root->bLimited)
+                    if (dlbIsOn(comm) && root->bLimited)
                    {
                         load->sum_m *= dd->nc[dim];
                         load->flags |= (1<<d);
                     }
@@ ... @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
         comm->load_step += comm->cycl[ddCyclStep];
         comm->load_sum  += comm->load[0].sum;
         comm->load_max  += comm->load[0].max;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
         {
             for (d = 0; d < dd->ndim; d++)
             {
@@ -5492,7 +5504,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
             fprintf(stderr, "%s", buf);
         }
         bLim = FALSE;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
         {
             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
             for (d = 0; d < dd->ndim; d++)
@@ -5535,7 +5547,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
             sprintf(buf,
                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                     " in the domain decomposition.\n", lossf*100);
-            if (!comm->bDynLoadBal)
+            if (!dlbIsOn(comm))
             {
                 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
             }
@@ -5580,6 +5592,9 @@ static float dd_f_imbal(gmx_domdec_t *dd)
 
 float dd_pme_f_ratio(gmx_domdec_t *dd)
 {
+    /* Should only be called on the DD master rank */
+    assert(DDMASTER(dd));
+
     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
     {
         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
@@ -5610,7 +5625,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
         fprintf(fplog, "\n");
     }
     fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
     {
         fprintf(fplog, " vol min/aver %5.3f%c",
                 dd_vol_min(dd), flags ? '!' : ' ');
@@ -5628,7 +5643,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
 
 static void dd_print_load_verbose(gmx_domdec_t *dd)
 {
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
     {
         fprintf(stderr, "vol %4.2f%c ",
                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
@@ -5669,7 +5684,7 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
     if (bPartOfGroup)
     {
         dd->comm->mpi_comm_load[dim_ind] = c_row;
-        if (dd->comm->eDLB != edlbNO)
+        if (dd->comm->dlbState != edlbsOffForever)
         {
             if (dd->ci[dim] == dd->master_ci[dim])
             {
@@ -5963,7 +5978,7 @@ void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
         }
     }
 
-    if (dd->comm->eDLB != edlbNO)
+    if (dd->comm->dlbState != edlbsOffForever)
     {
         snew(dd->comm->root, dd->ndim);
     }
@@ -6540,60 +6555,60 @@ static int check_dlb_support(FILE *fplog, t_commrec *cr,
                              const char *dlb_opt, gmx_bool bRecordLoad,
                             unsigned long Flags, t_inputrec *ir)
 {
-    int  eDLB = -1;
+    int  dlbState = -1;
     char buf[STRLEN];
 
     switch (dlb_opt[0])
     {
-        case 'a': eDLB = edlbAUTO; break;
-        case 'n': eDLB = edlbNO; break;
-        case 'y': eDLB = edlbYES; break;
+        case 'a': dlbState = edlbsOffCanTurnOn; break;
+        case 'n': dlbState = edlbsOffForever; break;
+        case 'y': dlbState = edlbsOn; break;
        default: gmx_incons("Unknown dlb_opt");
     }
 
     if (Flags & MD_RERUN)
     {
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (!EI_DYNAMICS(ir->eI))
     {
-        if (eDLB == edlbYES)
+        if (dlbState == edlbsOn)
         {
             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
             dd_warning(cr, fplog, buf);
         }
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (!bRecordLoad)
     {
         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (Flags & MD_REPRODUCIBLE)
     {
-        switch (eDLB)
+        switch (dlbState)
         {
-            case edlbNO:
+            case edlbsOffForever:
                 break;
-            case edlbAUTO:
+            case edlbsOffCanTurnOn:
                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
-                eDLB = edlbNO;
+                dlbState = edlbsOffForever;
                 break;
-            case edlbYES:
+            case edlbsOn:
                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                 break;
             default:
-                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
+                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
                 break;
         }
     }
 
-    return eDLB;
+    return dlbState;
 }
 
 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
@@ -6739,16 +6754,14 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     /* Initialize to GPU share count to 0, might change later */
     comm->nrank_gpu_shared = 0;
 
-    comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
-    comm->bDLB_locked = FALSE;
+    comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
     comm->bCheckWhetherToTurnDlbOn = TRUE;
-    comm->bDynLoadBal = (comm->eDLB == edlbYES);
     if (fplog)
     {
-        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
+        fprintf(fplog, "Dynamic load balancing: %s\n",
+                edlbs_names[comm->dlbState]);
     }
-    dd->bGridJump              = comm->bDynLoadBal;
     comm->bPMELoadBalDLBLimits = FALSE;
 
     if (comm->nstSortCG)
@@ -6941,7 +6954,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 
         /* We need to choose the optimal DD grid and possibly PME nodes */
         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
-                               comm->eDLB != edlbNO, dlb_scale,
+                               comm->dlbState != edlbsOffForever, dlb_scale,
                                comm->cellsize_limit, comm->cutoff,
                               comm->bInterCGBondeds);
 
@@ -6950,7 +6963,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
sprintf(buf, "Change the number of ranks or mdrun option %s%s%s", !bC ? "-rdd" : "-rcon", - comm->eDLB != edlbNO ? " or -dds" : "", + comm->dlbState != edlbsOffForever ? " or -dds" : "", bC ? " or your LINCS settings" : ""); gmx_fatal_collective(FARGS, cr, NULL, @@ -7045,7 +7058,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, *npme_y = comm->npmenodes_y; snew(comm->slb_frac, DIM); - if (comm->eDLB == edlbNO) + if (comm->dlbState == edlbsOffForever) { comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex); comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey); @@ -7054,7 +7067,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, if (comm->bInterCGBondeds && comm->cutoff_mbody == 0) { - if (comm->bBondComm || comm->eDLB != edlbNO) + if (comm->bBondComm || comm->dlbState != edlbsOffForever) { /* Set the bonded communication distance to halfway * the minimum and the maximum, @@ -7062,7 +7075,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, */ acs = average_cellsize_min(dd, ddbox); comm->cutoff_mbody = 0.5*(r_bonded + acs); - if (comm->eDLB != edlbNO) + if (comm->dlbState != edlbsOffForever) { /* Check if this does not limit the scaling */ comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs); @@ -7153,14 +7166,13 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step) dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n"); /* Change DLB from "auto" to "no". */ - comm->eDLB = edlbNO; + comm->dlbState = edlbsOffForever; return; } dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n"); - comm->bDynLoadBal = TRUE; - dd->bGridJump = TRUE; + comm->dlbState = edlbsOn; set_dlb_limits(dd); @@ -7331,7 +7343,7 @@ static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd, std::max(comm->cutoff, comm->cutoff_mbody)); fprintf(fplog, "%40s %-7s %6.3f nm\n", "multi-body bonded interactions", "(-rdd)", - (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit)); + (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit)); } if (dd->vsite_comm) { @@ -7450,7 +7462,7 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd, { comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit); } - if (comm->bDynLoadBal) + if (dlbIsOn(comm)) { set_dlb_limits(dd); } @@ -7509,13 +7521,13 @@ void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale, { fprintf(debug, "The DD cut-off is %f\n", comm->cutoff); } - if (comm->eDLB != edlbNO) + if (comm->dlbState != edlbsOffForever) { set_cell_limits_dlb(dd, dlb_scale, ir, ddbox); } - print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox); - if (comm->eDLB == edlbAUTO) + print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox); + if (comm->dlbState == edlbsOffCanTurnOn) { if (fplog) { @@ -7571,7 +7583,7 @@ static gmx_bool test_dd_cutoff(t_commrec *cr, np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); - if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim && dd->comm->cd[d].np_dlb > 0) { if (np > dd->comm->cd[d].np_dlb) @@ -7590,12 +7602,12 @@ static gmx_bool test_dd_cutoff(t_commrec *cr, } } - if (dd->comm->eDLB != edlbNO) + if (dd->comm->dlbState != edlbsOffForever) { /* If DLB is not active yet, we don't need to check the grid jumps. 
          * Actually we shouldn't, because then the grid jump data is not set.
          */
-        if (dd->comm->bDynLoadBal &&
+        if (dlbIsOn(dd->comm) &&
             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
         {
             LocallyLimited = 1;
@@ -7655,7 +7667,7 @@ void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
  */
 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
 {
-    if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd))
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
     {
         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
     }
@@ -7668,7 +7680,7 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
 {
     const int nddp_chk_dlb = 100;
 
-    if (dd->comm->eDLB != edlbAUTO)
+    if (dd->comm->dlbState != edlbsOffCanTurnOn)
     {
         return FALSE;
     }
@@ -7695,30 +7707,30 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
 
 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
 {
-    return dd->comm->bDynLoadBal;
+    return (dd->comm->dlbState == edlbsOn);
 }
 
 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
 {
-    return dd->comm->bDLB_locked;
+    return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
 }
 
 void dd_dlb_lock(gmx_domdec_t *dd)
 {
     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
     {
-        dd->comm->bDLB_locked = TRUE;
+        dd->comm->dlbState = edlbsOffTemporarilyLocked;
     }
 }
 
 void dd_dlb_unlock(gmx_domdec_t *dd)
 {
     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
     {
-        dd->comm->bDLB_locked = FALSE;
-        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, !dd->comm->bDynLoadBal);
+        dd->comm->dlbState = edlbsOffCanTurnOn;
+        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
     }
 }
 
@@ -7883,7 +7895,7 @@ set_dd_corners(const gmx_domdec_t *dd,
         c->c[1][0] = comm->cell_x0[dim1];
         /* All rows can see this row */
         c->c[1][1] = comm->cell_x0[dim1];
-        if (dd->bGridJump)
+        if (dlbIsOn(dd->comm))
         {
             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
             if (bDistMB)
@@ -7902,7 +7914,7 @@ set_dd_corners(const gmx_domdec_t *dd,
             {
                 c->c[2][j] = comm->cell_x0[dim2];
             }
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 /* Use the maximum of the i-cells that see a j-cell */
                 for (i = 0; i < zones->nizone; i++)
@@ -7937,7 +7949,7 @@ set_dd_corners(const gmx_domdec_t *dd,
              */
             c->cr1[0] = comm->cell_x1[dim1];
             c->cr1[3] = comm->cell_x1[dim1];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                 if (bDistMB)
@@ -8276,7 +8288,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
     bBondComm = comm->bBondComm;
 
     /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
 
     /* Do we need to determine extra distances for only two-body bondeds? */
     bDist2B = (bBondComm && !bDistMB);
@@ -8708,7 +8720,7 @@ static void set_zones_size(gmx_domdec_t *dd,
 
     zones = &comm->zones;
 
     /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
 
     for (z = zone_start; z < zone_end; z++)
     {
@@ -8728,7 +8740,7 @@ static void set_zones_size(gmx_domdec_t *dd,
             /* With a staggered grid we have different sizes
              * for non-shifted dimensions.
              */
-            if (dd->bGridJump && zones->shift[z][dim] == 0)
+            if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
             {
                 if (d == 1)
                 {
@@ -8757,7 +8769,7 @@ static void set_zones_size(gmx_domdec_t *dd,
             if (zones->shift[z][dim] > 0)
             {
                 dim = dd->dim[d];
-                if (!dd->bGridJump || d == 0)
+                if (!dlbIsOn(dd->comm) || d == 0)
                 {
                     zones->size[z].x0[dim] = comm->cell_x1[dim];
                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
@@ -9469,7 +9481,7 @@ void dd_partition_system(FILE *fplog,
 
     bNStGlobalComm = (step % nstglobalcomm == 0);
 
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
     {
         bDoDLB = FALSE;
     }
@@ -9630,7 +9642,7 @@ void dd_partition_system(FILE *fplog,
         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                   TRUE, &top_local->cgs, state_local->x, &ddbox);
 
-        bRedist = comm->bDynLoadBal;
+        bRedist = dlbIsOn(comm);
     }
     else
     {
diff --git a/src/gromacs/domdec/domdec_specatomcomm.cpp b/src/gromacs/domdec/domdec_specatomcomm.cpp
index abc52ba315..75d6a2e3d1 100644
--- a/src/gromacs/domdec/domdec_specatomcomm.cpp
+++ b/src/gromacs/domdec/domdec_specatomcomm.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -603,7 +603,7 @@ int setup_specat_communication(gmx_domdec_t *dd,
                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ],
                       nrecv_local, ireq->n, specat_type,
                       specat_type, add_err,
-                      dd->bGridJump ? " or use the -rcon option of mdrun" : "");
+                      dd_dlb_is_on(dd) ? " or use the -rcon option of mdrun" : "");
         }
 
         spac->at_start = at_start;
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp
index f8facc5621..089361143a 100644
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -934,11 +934,17 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
     {
         pme_lb->bBalance = dd_dlb_is_on(cr->dd);
     }
-    else
+    /* We should ignore the first timing to avoid timing allocation
+     * overhead. And since the PME load balancing is called just
+     * before DD repartitioning, the ratio returned by dd_pme_f_ratio
+     * is not over the last nstlist steps, but the nstlist steps before
+     * that. So the first useful ratio is available at step_rel=3*nstlist.
+     */
+    else if (step_rel >= 3*ir->nstlist)
     {
         if (DDMASTER(cr->dd))
         {
-            /* PME node load is too high, start tuning */
+            /* If PME rank load is too high, start tuning */
             pme_lb->bBalance =
                 (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
         }
@@ -1019,7 +1025,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
     }
 
     if (!pme_lb->bBalance &&
-        (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop)))
+        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
     {
         /* We have just deactivated the balancing and we're not measuring PP/PME
          * imbalance during the first steps of the run: deactivate the tuning.
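[Editorial note: a worked example of the 3*nstlist comment above, with an illustrative value. With nstlist = 20, the ratio visible at step_rel = 40 still describes steps 0-20, which include the timing-allocation overhead; the ratio visible at step_rel = 60 = 3*nstlist describes steps 20-40 and is the first one worth acting on, which is exactly the condition the new else-if enforces.]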
diff --git a/src/gromacs/gmxana/gmx_dos.c b/src/gromacs/gmxana/gmx_dos.c
index 7592091e9b..d4423adc47 100644
--- a/src/gromacs/gmxana/gmx_dos.c
+++ b/src/gromacs/gmxana/gmx_dos.c
@@ -56,6 +56,7 @@
 #include "gromacs/math/units.h"
 #include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
+#include "gromacs/topology/index.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/futil.h"
 #include "gromacs/utility/smalloc.h"
@@ -65,6 +66,40 @@ enum {
     VACF, MVACF, DOS, DOS_SOLID, DOS_DIFF, DOS_CP, DOS_S, DOS_A, DOS_E, DOS_NR
 };
 
+static int calcMoleculesInIndexGroup(t_block *mols, int natoms, atom_id *index, int nindex)
+{
+    int i    = 0;
+    int mol  = 0;
+    int nMol = 0;
+    int j;
+
+    while (i < nindex)
+    {
+        while (index[i] > mols->index[mol])
+        {
+            mol++;
+            if (mol >= mols->nr)
+            {
+                gmx_fatal(FARGS, "Atom index out of range: %d", index[i]+1);
+            }
+        }
+        for (j = mols->index[mol]; j < mols->index[mol+1]; j++)
+        {
+            if (index[i] != j)
+            {
+                gmx_fatal(FARGS, "The index group does not consist of whole molecules");
+            }
+            i++;
+            if (i == natoms)
+            {
+                gmx_fatal(FARGS, "Index contains atom numbers larger than the topology");
+            }
+        }
+        nMol++;
+    }
+    return nMol;
+}
+
 static double FD(double Delta, double f)
 {
     return (2*pow(Delta, -4.5)*pow(f, 7.5) -
@@ -214,47 +249,6 @@ static real wEsolid(real nu, real beta)
     }
 }
 
-static void dump_fy(output_env_t oenv, real toler)
-{
-    FILE       *fp;
-    double      Delta, f, y, DD;
-    const char *leg[] = { "f", "fy", "y" };
-
-    DD = pow(10.0, 0.125);
-    fp = xvgropen("fy.xvg", "Fig. 2, Lin2003a", "Delta", "y or fy", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    if (output_env_get_print_xvgr_codes(oenv))
-    {
-        fprintf(fp, "@ world 1e-05, 0, 1000, 1\n");
-        fprintf(fp, "@ xaxes scale Logarithmic\n");
-    }
-    for (Delta = 1e-5; (Delta <= 1000); Delta *= DD)
-    {
-        f = calc_fluidicity(Delta, toler);
-        y = calc_y(f, Delta, toler);
-        fprintf(fp, "%10g %10g %10g %10g\n", Delta, f, f*y, y);
-    }
-    xvgrclose(fp);
-}
-
-static void dump_w(output_env_t oenv, real beta)
-{
-    FILE       *fp;
-    double      nu;
-    const char *leg[] = { "wCv", "wS", "wA", "wE" };
-
-    fp = xvgropen("w.xvg", "Fig. 1, Berens1983a", "\\f{12}b\\f{4}h\\f{12}n",
-                  "w", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    for (nu = 1; (nu < 100); nu += 0.05)
-    {
-        fprintf(fp, "%10g %10g %10g %10g %10g\n", beta*PLANCK*nu,
-                wCsolid(nu, beta), wSsolid(nu, beta),
-                wAsolid(nu, beta), wEsolid(nu, beta));
-    }
-    xvgrclose(fp);
-}
-
 int gmx_dos(int argc, char *argv[])
 {
     const char *desc[] = {
@@ -264,6 +258,11 @@ int gmx_dos(int argc, char *argv[])
         "all vibrations. For flexible systems that would be around a few fs",
         "between saving. Properties based on the DoS are printed on the",
         "standard output."
+        "Note that the density of states is calculated from the mass-weighted",
+        "autocorrelation, and by default only from the square of the real",
+        "component rather than absolute value. This means the shape can differ",
+        "substantially from the plain vibrational power spectrum you can",
+        "calculate with gmx velacc."
     };
     const char *bugs[] = {
         "This program needs a lot of memory: total usage equals the number of atoms times 3 times number of frames times 4 (or 8 when run in double precision)."
@@ -284,10 +283,16 @@ int gmx_dos(int argc, char *argv[])
     gmx_fft_t           fft;
     double              cP, S, A, E, DiffCoeff, Delta, f, y, z, sigHS, Shs, Sig, DoS0, recip_fac;
     double              wCdiff, wSdiff, wAdiff, wEdiff;
-
-    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalize = FALSE;
-    static     gmx_bool bRecip   = FALSE, bDump = FALSE;
+    int                 grpNatoms;
+    atom_id            *index;
+    char               *grpname;
+    double              invNormalize;
+    gmx_bool            normalizeAutocorrelation;
+
+    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalizeDos = FALSE;
+    static     gmx_bool bRecip   = FALSE;
     static     real     Temp     = 298.15, toler = 1e-6;
+
     t_pargs             pa[]     = {
         { "-v", FALSE, etBOOL, {&bVerbose},
           "Be loud and noisy." },
@@ -295,14 +300,12 @@ int gmx_dos(int argc, char *argv[])
           "Use cm^-1 on X-axis instead of 1/ps for DoS plots." },
         { "-abs", FALSE, etBOOL, {&bAbsolute},
           "Use the absolute value of the Fourier transform of the VACF as the Density of States. Default is to use the real component only" },
-        { "-normdos", FALSE, etBOOL, {&bNormalize},
-          "Normalize the DoS such that it adds up to 3N. This is a hack that should not be necessary." },
+        { "-normdos", FALSE, etBOOL, {&bNormalizeDos},
+          "Normalize the DoS such that it adds up to 3N. This should usually not be necessary." },
         { "-T", FALSE, etREAL, {&Temp},
           "Temperature in the simulation" },
         { "-toler", FALSE, etREAL, {&toler},
-          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" },
-        { "-dump", FALSE, etBOOL, {&bDump},
-          "[HIDDEN]Dump the y/fy plot corresponding to Fig. 2 inLin2003a and the and the weighting functions corresponding to Fig. 1 in Berens1983a." }
+          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" }
     };
 
     t_filenm            fnm[] = {
@@ -331,30 +334,26 @@ int gmx_dos(int argc, char *argv[])
     }
 
     beta = 1/(Temp*BOLTZ);
-    if (bDump)
-    {
-        printf("Dumping reference figures. Thanks for your patience.\n");
-        dump_fy(oenv, toler);
-        dump_w(oenv, beta);
-        exit(0);
-    }
 
     fplog = gmx_fio_fopen(ftp2fn(efLOG, NFILE, fnm), "w");
     fprintf(fplog, "Doing density of states analysis based on trajectory.\n");
     please_cite(fplog, "Pascal2011a");
     please_cite(fplog, "Caleman2011b");
 
-    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box,
-                  TRUE);
+    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box, TRUE);
+
+    /* Handle index groups */
+    get_index(&top.atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &grpNatoms, &index, &grpname);
+
     V     = det(box);
     tmass = 0;
-    for (i = 0; (i < top.atoms.nr); i++)
+    for (i = 0; i < grpNatoms; i++)
     {
-        tmass += top.atoms.atom[i].m;
+        tmass += top.atoms.atom[index[i]].m;
     }
 
-    Natom = top.atoms.nr;
-    Nmol  = top.mols.nr;
+    Natom = grpNatoms;
+    Nmol  = calcMoleculesInIndexGroup(&top.mols, top.atoms.nr, index, grpNatoms);
     gnx   = Natom*DIM;
 
     /* Correlation stuff */
@@ -390,9 +389,9 @@ int gmx_dos(int argc, char *argv[])
         }
         for (i = 0; i < gnx; i += DIM)
         {
-            c1[i+XX][nframes] = fr.v[i/DIM][XX];
-            c1[i+YY][nframes] = fr.v[i/DIM][YY];
-            c1[i+ZZ][nframes] = fr.v[i/DIM][ZZ];
+            c1[i+XX][nframes] = fr.v[index[i/DIM]][XX];
+            c1[i+YY][nframes] = fr.v[index[i/DIM]][YY];
+            c1[i+ZZ][nframes] = fr.v[index[i/DIM]][ZZ];
         }
 
         t1 = fr.time;
@@ -413,6 +412,21 @@ int gmx_dos(int argc, char *argv[])
         printf("Going to do %d fourier transforms of length %d. Hang on.\n",
                gnx, nframes);
     }
+    /* Unfortunately the -normalize program option for the autocorrelation
+     * function calculation is added as a hack with a static variable in the
+     * autocorrelation.c source. That would work if we called the normal
+     * do_autocorr(), but this routine overrides that by directly calling
+     * the low-level functionality. That unfortunately leads to ignoring the
+     * default value for the option (which is to normalize).
+     * Since the absolute value seems to be important for the subsequent
+     * analysis below, we detect the value directly from the option, calculate
+     * the autocorrelation without normalization, and then apply the
+     * normalization just to the autocorrelation output
+     * (or not, if the user asked for a non-normalized autocorrelation).
+     */
+    normalizeAutocorrelation = opt2parg_bool("-normalize", npargs, ppa);
+
+    /* Note that we always disable normalization here, regardless of user settings */
     low_do_autocorr(NULL, oenv, NULL, nframes, gnx, nframes, c1, dt, eacNormal, 0, FALSE,
                     FALSE, FALSE, -1, -1, 0);
     snew(dos, DOS_NR);
@@ -427,7 +441,7 @@ int gmx_dos(int argc, char *argv[])
     }
     for (i = 0; (i < gnx); i += DIM)
     {
-        mi = top.atoms.atom[i/DIM].m;
+        mi = top.atoms.atom[index[i/DIM]].m;
         for (j = 0; (j < nframes/2); j++)
         {
             c1j            = (c1[i+XX][j] + c1[i+YY][j] + c1[i+ZZ][j]);
@@ -435,20 +449,28 @@ int gmx_dos(int argc, char *argv[])
             dos[MVACF][j] += mi*c1j;
         }
     }
-    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity ACF",
+
+    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity autocorrelation function",
                   "Time (ps)", "C(t)", oenv);
     snew(tt, nframes/2);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
     for (j = 0; (j < nframes/2); j++)
     {
         tt[j] = j*dt;
-        fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j]);
+        fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j] * invNormalize);
     }
     xvgrclose(fp);
-    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity ACF",
+
+    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity autocorrelation function",
                   "Time (ps)", "C(t)", oenv);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
     for (j = 0; (j < nframes/2); j++)
     {
-        fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j]);
+        fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j] * invNormalize);
     }
     xvgrclose(fp);
 
@@ -483,7 +505,7 @@ int gmx_dos(int argc, char *argv[])
     }
     /* Normalize it */
     dostot = evaluate_integral(nframes/4, nu, dos[DOS], NULL, nframes/4, &stddev);
-    if (bNormalize)
+    if (bNormalizeDos)
     {
         for (j = 0; (j < nframes/4); j++)
         {
@@ -571,18 +593,6 @@ int gmx_dos(int argc, char *argv[])
     cP = BOLTZ * evaluate_integral(nframes/4, nu, dos[DOS_CP], NULL,
                                    nframes/4, &stddev);
     fprintf(fplog, "Heat capacity %g J/mol K\n", 1000*cP/Nmol);
-
-    /*
-       S  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_S],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Entropy %g J/mol K\n",1000*S/Nmol);
-       A  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_A],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Helmholtz energy %g kJ/mol\n",A/Nmol);
-       E  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_E],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Internal energy %g kJ/mol\n",E/Nmol);
-     */
     fprintf(fplog, "\nArrivederci!\n");
     gmx_fio_fclose(fplog);
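[Editorial note: a hedged usage sketch of the new index-group support; the file names and group choice are placeholders, and the -n flag corresponds to the efNDX entry read via get_index() above.]

    gmx dos -f traj.trr -s topol.tpr -n index.ndx -normdos

If the selected group does not contain whole molecules, calcMoleculesInIndexGroup() aborts with a fatal error, since the fluidicity analysis needs a well-defined molecule count Nmol.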
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
index 5e4733df1d..e93fe43d10 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -77,29 +77,40 @@
 
 #ifdef GMX_GPU
-const gmx_bool bGPUBinary = TRUE;
+
+static const bool bGPUBinary = TRUE;
+
 #  ifdef GMX_USE_OPENCL
-const char *gpu_implementation = "OpenCL";
+
+static const char *gpu_implementation = "OpenCL";
 /* Our current OpenCL implementation only supports using exactly one
  * GPU per PP rank, so sharing is impossible */
-const gmx_bool bGpuSharingSupported = FALSE;
+static const bool bGpuSharingSupported = false;
 /* Our current OpenCL implementation is not known to handle
  * concurrency correctly (at context creation, JIT compilation, or JIT
  * cache-management stages). OpenCL runtimes need not support it
  * either; library MPI segfaults when creating OpenCL contexts;
  * thread-MPI seems to work but is not yet known to be safe. */
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#  else
-const char *gpu_implementation = "CUDA";
-const gmx_bool bGpuSharingSupported = TRUE;
-const gmx_bool bMultiGpuPerNodeSupported = TRUE;
-#  endif
-#else
-const gmx_bool bGPUBinary = FALSE;
-const char *gpu_implementation = "non-GPU";
-const gmx_bool bGpuSharingSupported = FALSE;
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#endif
+static const bool bMultiGpuPerNodeSupported = false;
+
+#  else /* GMX_USE_OPENCL */
+
+// Our CUDA implementation supports everything
+static const char *gpu_implementation        = "CUDA";
+static const bool  bGpuSharingSupported      = true;
+static const bool  bMultiGpuPerNodeSupported = true;
+
+#  endif /* GMX_USE_OPENCL */
+
+#else /* GMX_GPU */
+
+// Not compiled with GPU support
+static const bool  bGPUBinary                = false;
+static const char *gpu_implementation        = "non-GPU";
+static const bool  bGpuSharingSupported      = false;
+static const bool  bMultiGpuPerNodeSupported = false;
+
+#endif /* GMX_GPU */
 
 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h) */
 const char * const gpu_detect_res_str[egpuNR] =
@@ -124,6 +135,16 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank);
 static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
                                     const gmx_gpu_opt_t  *gpu_opt);
 
+gmx_bool gmx_multiple_gpu_per_node_supported()
+{
+    return bMultiGpuPerNodeSupported;
+}
+
+gmx_bool gmx_gpu_sharing_supported()
+{
+    return bGpuSharingSupported;
+}
+
 static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
 {
     int i, ndev;
@@ -440,7 +461,8 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
         }
         else
         {
-            if (ngpu_comp > npppn)
+            /* TODO Should we have a gpu_opt->n_dev_supported field? */
+            if (ngpu_comp > npppn && gmx_multiple_gpu_per_node_supported())
             {
                 md_print_warn(cr, fplog,
                               "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
@@ -460,13 +482,26 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
              */
             if (cr->rank_pp_intranode == 0)
             {
+                std::string reasonForLimit;
+                if (ngpu_comp > 1 &&
+                    ngpu_use == 1 &&
+                    !gmx_multiple_gpu_per_node_supported())
+                {
+                    reasonForLimit  = "can be used by ";
+                    reasonForLimit += gpu_implementation;
+                    reasonForLimit += " in GROMACS";
+                }
+                else
+                {
+                    reasonForLimit = "was detected";
+                }
                 gmx_fatal(FARGS,
                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                          "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+                          "%s was started with %d PP %s%s%s, but only %d GPU%s %s.",
                           th_or_proc, btMPI ? "s" : "es", pernode,
                           ShortProgram(), npppn, th_or_proc,
                           th_or_proc_plural, pernode,
-                          ngpu_use, gpu_use_plural);
+                          ngpu_use, gpu_use_plural, reasonForLimit.c_str());
             }
         }
     }
@@ -552,7 +587,7 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
         {
             int device_id;
 
-            device_id           = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+            device_id           = gmx_gpu_sharing_supported() ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
             uniq_ids[device_id] = 1;
         }
         /* Count the devices used. */
@@ -1121,11 +1156,11 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
 
         parse_digits_from_plain_string(env,
                                        &gpu_opt->n_dev_use,
                                        &gpu_opt->dev_use);
-        if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+        if (!gmx_multiple_gpu_per_node_supported() && 1 < gpu_opt->n_dev_use)
         {
             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", gpu_implementation);
         }
-        if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+        if (!gmx_gpu_sharing_supported() && anyGpuIdIsRepeated(gpu_opt))
         {
             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
         }
@@ -1231,7 +1266,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
     {
         if (nrank % gpu_opt->n_dev_compatible == 0)
         {
-            nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
+            nshare = gmx_gpu_sharing_supported() ? nrank/gpu_opt->n_dev_compatible : 1;
         }
         else
         {
@@ -1252,7 +1287,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
         /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
         gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
-        if (!bMultiGpuPerNodeSupported)
+        if (!gmx_multiple_gpu_per_node_supported())
         {
             gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
         }
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
index 783016de1a..4e43fd0e1c 100644
--- a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
@@ -750,7 +750,7 @@ ocl_get_build_options_string(cl_context context,
 
     for (std::string::size_type i = 0; i < unescaped_ocl_root_path.length(); i++)
     {
-        if (inputStr[i] == ' ')
+        if (unescaped_ocl_root_path[i] == ' ')
         {
             ocl_root_path.push_back('\\');
         }
diff --git a/src/gromacs/gmxpreprocess/readpull.c b/src/gromacs/gmxpreprocess/readpull.c
index 49793283cc..f2fb2d2203 100644
--- a/src/gromacs/gmxpreprocess/readpull.c
+++ b/src/gromacs/gmxpreprocess/readpull.c
@@ -262,12 +262,12 @@ char **read_pullparams(int *ninp_p, t_inpfile **inp_p,
         int    ngroup;
 
         pcrd = &pull->coord[i-1];
-        sprintf(buf, "pull-coord%d-groups", i);
-        STYPE(buf, groups, "");
         sprintf(buf, "pull-coord%d-type", i);
         EETYPE(buf, pcrd->eType, epull_names);
         sprintf(buf, "pull-coord%d-geometry", i);
         EETYPE(buf, pcrd->eGeom, epullg_names);
+        sprintf(buf, "pull-coord%d-groups", i);
+        STYPE(buf, groups, "");
         nscan = sscanf(groups, "%d %d %d %d %d", &pcrd->group[0], &pcrd->group[1],
                        &pcrd->group[2], &pcrd->group[3], &idum);
         ngroup = (pcrd->eGeom == epullgDIRRELATIVE) ? 4 : 2;
diff --git a/src/gromacs/legacyheaders/gmx_detect_hardware.h b/src/gromacs/legacyheaders/gmx_detect_hardware.h
index 56b0384fe6..338ad8dd81 100644
--- a/src/gromacs/legacyheaders/gmx_detect_hardware.h
+++ b/src/gromacs/legacyheaders/gmx_detect_hardware.h
@@ -45,6 +45,18 @@ extern "C" {
 } /* fixes auto-indentation problems */
 #endif
 
+/*! \brief Return whether mdrun can use more than one GPU per node
+ *
+ * The OpenCL implementation cannot use more than one GPU per node,
+ * for example. */
+gmx_bool gmx_multiple_gpu_per_node_supported();
+
+/*! \brief Return whether PP ranks can share a GPU
+ *
+ * The OpenCL implementation cannot share a GPU between ranks, for
+ * example. */
+gmx_bool gmx_gpu_sharing_supported();
+
 /* the init and consistency functions depend on commrec that may not be
    consistent in cuda because MPI types don't exist there.  */
 #ifndef __CUDACC__
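/* [Editorial sketch, not part of the patch: how a caller can use these
 * capability queries. gpu_opt->n_dev_use is the user-requested GPU count
 * parsed in gmx_parse_gpu_ids() above; the message text is illustrative.] */
if (gpu_opt->n_dev_use > 1 && !gmx_multiple_gpu_per_node_supported())
{
    gmx_fatal(FARGS, "Only one GPU per node can be used by this build");
}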
diff --git a/src/gromacs/legacyheaders/types/commrec.h b/src/gromacs/legacyheaders/types/commrec.h
index cfc7669a2f..64521b85ae 100644
--- a/src/gromacs/legacyheaders/types/commrec.h
+++ b/src/gromacs/legacyheaders/types/commrec.h
@@ -3,7 +3,7 @@
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -161,7 +161,6 @@ struct gmx_domdec_t {
     ivec     nc;
     int      ndim;
     ivec     dim; /* indexed by 0 to ndim */
-    gmx_bool bGridJump;
 
     /* PBC from dim 0 to npbcdim */
     int npbcdim;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
index f6ee458292..e09de482f9 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -297,16 +297,16 @@ struct gmx_nbnxn_ocl_t
     cl_command_queue stream[2]; /**< local and non-local GPU queues */
 
     /** events used for synchronization */
-    cl_event nonlocal_done;              /**< event triggered when the non-local non-bonded kernel
-                                            is done (and the local transfer can proceed) */
-    cl_event isc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
-                                            the local stream that need to precede the
-                                            non-local force calculations are done
-                                            (e.g. f buffer 0-ing, local x/q H2D) */
-
-    cl_bool                     bDoTime; /**< True if event-based timing is enabled. */
-    cl_timers_t                *timers;  /**< OpenCL event-based timers. */
-    struct gmx_wallclock_gpu_t *timings; /**< Timing data. */
+    cl_event nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
+                                             is done (and the local transfer can proceed) */
+    cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+                                             the local stream that need to precede the
+                                             non-local force calculations are done
+                                             (e.g. f buffer 0-ing, local x/q H2D) */
+
+    cl_bool                     bDoTime;  /**< True if event-based timing is enabled. */
+    cl_timers_t                *timers;   /**< OpenCL event-based timers. */
+    struct gmx_wallclock_gpu_t *timings;  /**< Timing data. */
 };
 
 #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c
index a372082356..16fd3b0101 100644
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -4014,7 +4014,8 @@ static void split_sci_entry(nbnxn_pairlist_t *nbl,
             nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
         }
 
-        if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+        /* Check if we should split at this cj4 to get a list of size nsp */
+        if (nsp > 0 && nsp + nsp_cj4 > nsp_max)
         {
             /* Split the list at cj4 */
             nbl->sci[sci].cj4_ind_end = cj4;
@@ -4379,7 +4380,7 @@ static void get_nsubpair_target(const nbnxn_search_t nbs,
 
         ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
         ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
-        ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+        ls[ZZ] = grid->na_c/(grid->atom_density*ls[XX]*ls[YY]);
 
         /* The average squared length of the diagonal of a sub cell */
         xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
@@ -4409,11 +4410,25 @@ static void get_nsubpair_target(const nbnxn_search_t nbs,
         /* 4 octants of a sphere */
         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
 
+        /* Estimate the number of cluster pairs as the local number of
+         * clusters times the volume they interact with times the density.
+         */
         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 
         /* Subtract the non-local pair count */
         nsp_est -= nsp_est_nl;
 
+        /* For small cut-offs nsp_est will be an underestimate.
+         * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
+         * So to avoid too small or negative nsp_est we set a minimum of
+         * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
+         * This might be a slight overestimate for small non-periodic groups of
+         * atoms as will occur for a local domain with DD, but for small
+         * groups of atoms we'll anyhow be limited by nsubpair_target_min,
+         * so this overestimation will not matter.
+         */
+        nsp_est = max(nsp_est, grid->nsubc_tot*14.0);
+
         if (debug)
         {
             fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
diff --git a/src/gromacs/mdlib/nsgrid.cpp b/src/gromacs/mdlib/nsgrid.cpp
index b92d5a5c76..63649e3bee 100644
--- a/src/gromacs/mdlib/nsgrid.cpp
+++ b/src/gromacs/mdlib/nsgrid.cpp
@@ -250,7 +250,7 @@ static void set_grid_sizes(matrix box, rvec izones_x0, rvec izones_x1, real rlis
              * direction has uniform DD cell boundaries.
             */
            bDDRect = !(ddbox->tric_dir[i] ||
-                        (dd->bGridJump && i != dd->dim[0]));
+                        (dd_dlb_is_on(dd) && i != dd->dim[0]));
 
             radd = rlist;
             if (i >= ddbox->npbcdim &&
diff --git a/src/gromacs/trajectoryanalysis/modules/distance.cpp b/src/gromacs/trajectoryanalysis/modules/distance.cpp
index 96f752b5d5..4b07d1483a 100644
--- a/src/gromacs/trajectoryanalysis/modules/distance.cpp
+++ b/src/gromacs/trajectoryanalysis/modules/distance.cpp
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -271,7 +271,7 @@ Distance::initAnalysis(const TrajectoryAnalysisSettings &settings,
     {
         AnalysisDataPlotModulePointer plotm(
                 new AnalysisDataPlotModule(settings.plotSettings()));
-        plotm->setFileName(fnAll_);
+        plotm->setFileName(fnXYZ_);
         plotm->setTitle("Distance");
         plotm->setXAxisIsTime();
         plotm->setYLabel("Distance (nm)");
diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp
index 8378845cc1..9042aee8a9 100644
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/programs/mdrun/resource-division.cpp
@@ -53,6 +53,22 @@
 
 #include "gromacs/utility/fatalerror.h"
 
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the system simulation,
+ * the algorithms used and the hardware it's running on, as well as if there
+ * are other jobs running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
 #ifdef GMX_THREAD_MPI
 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
@@ -64,7 +80,7 @@ static const int min_atoms_per_gpu        = 900;
 /* TODO choose nthreads_omp based on hardware topology
    when we have a hardware topology detection library */
 /* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 * even on two CPUs it's usually faster (but with many OpenMP threads
@@ -77,16 +93,16 @@ static const int min_atoms_per_gpu        = 900;
 * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
 * model numbers we ensure also future Intel CPUs are covered.
 */
-const int nthreads_omp_always_faster_default   =  6;
-const int nthreads_omp_always_faster_Nehalem   = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default   =  8;
+const int nthreads_omp_faster_Nehalem   = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
 
 /* For CPU only runs the fastest options are usually MPI or OpenMP only.
 * With one GPU, using MPI only is almost never optimal, so we need to
 * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 * OpenMP threads counts can still be ok. Multiplying the numbers above
 * by a factor of 2 seems to be a good estimate.
 */
-const int nthreads_omp_always_faster_gpu_fac   =  2;
+const int nthreads_omp_faster_gpu_fac   =  2;
 
 /* This is the case with MPI (2 or more MPI PP ranks).
 * By default we will terminate with a fatal error when more than 8
@@ -104,34 +120,30 @@ const int nthreads_omp_mpi_ok_min_gpu      = 2;
 const int nthreads_omp_mpi_target_max      = 6;
 
 
-#ifdef GMX_USE_OPENCL
-static const bool bGpuSharingSupported = false;
-#else
-static const bool bGpuSharingSupported = true;
-#endif
-
-
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
 {
     int nth;
 
     if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
         gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
     {
-        nth = nthreads_omp_always_faster_Intel_AVX;
+        nth = nthreads_omp_faster_Intel_AVX;
     }
     else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
     {
-        nth = nthreads_omp_always_faster_Nehalem;
+        nth = nthreads_omp_faster_Nehalem;
     }
     else
     {
-        nth = nthreads_omp_always_faster_default;
+        nth = nthreads_omp_faster_default;
     }
 
     if (bUseGPU)
     {
-        nth *= nthreads_omp_always_faster_gpu_fac;
+        nth *= nthreads_omp_faster_gpu_fac;
     }
 
     nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
@@ -139,6 +151,26 @@ static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
     return nth;
 }
 
+/* Returns the maximum OpenMP thread count that passes the efficiency check */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+                                      gmx_cpuid_t    cpuid_info,
+                                      gmx_bool       bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+    if (nrank > 1)
+    {
+        return nthreads_omp_mpi_ok_max;
+    }
+    else
+#endif
+    {
+        return nthreads_omp_faster(cpuid_info, bUseGPU);
+    }
+}
+
+/* Return the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
                                         int                  nthreads_tot,
@@ -161,9 +193,9 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
             /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
             nrank = nthreads_tot;
         }
-        else if (bGpuSharingSupported &&
-                 (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                            ngpu > 0) ||
+        else if (gmx_gpu_sharing_supported() &&
+                 (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+                                                     ngpu > 0) ||
                   (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
         {
             /* The high OpenMP thread count will likely result in sub-optimal
@@ -193,8 +225,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
     }
     else
     {
-        if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                       ngpu > 0))
+        if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
         {
             /* Use pure OpenMP parallelization */
             nrank = 1;
@@ -210,6 +241,34 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 }
 
 
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+    /* This code relies on the fact that GPU are not detected when GPU
+     * acceleration was disabled at run time by the user.
+     */
+    if (cutoff_scheme == ecutsVERLET &&
+        hwinfo->gpu_info.n_dev_compatible > 0)
+    {
+        if (gmx_multiple_gpu_per_node_supported())
+        {
+            return hwinfo->gpu_info.n_dev_compatible;
+        }
+        else
+        {
+            if (hwinfo->gpu_info.n_dev_compatible > 1)
+            {
+                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+            }
+            return 1;
+        }
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+
 #ifdef GMX_THREAD_MPI
 /* Get the number of MPI ranks to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
@@ -219,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const t_commrec     *cr,
@@ -227,16 +286,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 {
     int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
     int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
 
     /* Check if an algorithm does not support parallel simulation. */
     if (inputrec->eI == eiLBFGS ||
         inputrec->coulombtype == eelEWALD)
     {
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
 #ifdef GMX_THREAD_MPI
 /* Get the number of MPI ranks to use for thread-MPI based on how many
  * were requested, which algorithms we're using,
@@ -219,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
  * with the hardware, except that ntmpi could be larger than #GPU.
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                      const gmx_mtop_t    *mtop,
                      const t_commrec     *cr,
@@ -227,16 +286,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 {
     int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
     int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
 
     /* Check if an algorithm does not support parallel simulation. */
     if (inputrec->eI == eiLBFGS ||
         inputrec->coulombtype == eelEWALD)
     {
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
         if (hw_opt->nthreads_tmpi > 1)
         {
-            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI rank, but an algorithm doesn't support that");
         }
 
         return 1;
@@ -266,16 +324,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
         nthreads_tot_max = nthreads_hw;
     }
 
-    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.n_dev_compatible > 0);
-    if (bCanUseGPU)
-    {
-        ngpu = hwinfo->gpu_info.n_dev_compatible;
-    }
-    else
-    {
-        ngpu = 0;
-    }
+    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
 
     if (inputrec->cutoff_scheme == ecutsGROUP)
     {
@@ -297,7 +346,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
     }
     else
     {
-        if (bCanUseGPU)
+        if (ngpu >= 1)
         {
             min_atoms_per_mpi_rank = min_atoms_per_gpu;
         }
@@ -362,6 +411,26 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 
             nrank = nrank_new;
 
+            /* We reduced the number of tMPI ranks, which means we might violate
+             * our own efficiency checks if we simply use all hardware threads.
+             */
+            if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+            {
+                /* The user set neither the total nor the OpenMP thread count,
+                 * we should use all hardware threads, unless we will violate
+                 * our own efficiency limitation on the thread count.
+                 */
+                int nt_omp_max;
+
+                nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+                if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+                {
+                    /* Limit the number of OpenMP threads to start */
+                    hw_opt->nthreads_omp = nt_omp_max;
+                }
+            }
+
             fprintf(stderr, "\n");
             fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
             fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
 
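The block just added closes a loophole: when nrank has been lowered to satisfy min_atoms_per_mpi_rank, silently inheriting all hardware threads could push the per-rank OpenMP count past the efficiency limit. The decision reduces to a single comparison; an isolated sketch (illustrative helper, while in the diff the value is written into hw_opt->nthreads_omp only when a cap is needed):

    /* Returns the explicit per-rank OpenMP cap to set, or 0 to leave the
     * option unset so the later default (all threads / nrank) applies. */
    static int ompThreadCap(int nrank, int hwThreadsAvail, int ompEfficientMax)
    {
        return (nrank*ompEfficientMax < hwThreadsAvail) ? ompEfficientMax : 0;
    }

For example, 2 ranks with an efficiency limit of 8 on a 32-thread node would otherwise start 16 OpenMP threads each; the cap starts 8 per rank and deliberately leaves half the hardware threads idle.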
@@ -375,7 +444,7 @@
 
 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                         t_commrec           *cr,
                                         FILE                *fplog)
 {
@@ -392,6 +461,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      * OpenMP have been initialized. Check that here.
      */
 #ifdef GMX_THREAD_MPI
+    assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
     assert(hw_opt->nthreads_tmpi >= 1);
 #endif
     assert(gmx_omp_nthreads_get(emntDefault) >= 1);
@@ -434,7 +504,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
     if (DOMAINDECOMP(cr) && cr->nnodes > 1)
     {
         if (nth_omp_max < nthreads_omp_mpi_ok_min ||
-            (!(ngpu > 0 && !bGpuSharingSupported) &&
+            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
              nth_omp_max > nthreads_omp_mpi_ok_max))
         {
             /* Note that we print target_max here, not ok_max */
@@ -443,7 +513,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                     nthreads_omp_mpi_ok_min,
                     nthreads_omp_mpi_target_max);
 
-            if (bNTOptSet)
+            if (bNtOmpOptionSet)
             {
                 md_print_warn(cr, fplog, "NOTE: %s\n", buf);
             }
@@ -460,8 +530,8 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
     else
     {
         /* No domain decomposition (or only one domain) */
-        if (!(ngpu > 0 && !bGpuSharingSupported) &&
-            nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
+            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
         {
             /* To arrive here, the user/system set #ranks and/or #OMPthreads */
             gmx_bool bEnvSet;
@@ -469,7 +539,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 
             bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
 
-            if (bNTOptSet || bEnvSet)
+            if (bNtOmpOptionSet || bEnvSet)
             {
                 sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
             }
@@ -489,7 +559,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
              * with different values per rank or node, since in that case
              * the user can not set -ntomp to override the error.
              */
-            if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
             {
                 md_print_warn(cr, fplog, "NOTE: %s\n", buf);
             }
@@ -502,7 +572,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 #else /* GMX_OPENMP && GMX_MPI */
     /* No OpenMP and/or MPI: it doesn't make much sense to check */
     GMX_UNUSED_VALUE(hw_opt);
-    GMX_UNUSED_VALUE(bNTOptSet);
+    GMX_UNUSED_VALUE(bNtOmpOptionSet);
 
     /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
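Stripped of the message formatting, the domain-decomposition branch above accepts per-rank OpenMP counts inside a band and flags everything else; bNtOmpOptionSet only decides whether a violation is a note or a fatal error. A condensed sketch of the acceptance test (bounds passed as parameters instead of the file-scope constants):

    /* True when nthOmpMax should be flagged: below the efficient minimum,
     * or above the maximum unless GPUs are in use on a build without GPU
     * sharing support, where high per-rank counts are unavoidable. */
    static bool ompCountOutOfBand(int nthOmpMax, int okMin, int okMax,
                                  int ngpu, bool gpuSharingSupported)
    {
        return nthOmpMax < okMin ||
               (!(ngpu > 0 && !gpuSharingSupported) && nthOmpMax > okMax);
    }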
@@ -538,17 +608,18 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
     }
     if (hw_opt->nthreads_tmpi > 0)
     {
-        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+        gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
     }
 #endif
 
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP)
     {
-        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        }
+        hw_opt->nthreads_omp = 1;
     }
-    hw_opt->nthreads_omp = 1;
-#endif
 
     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
     {
@@ -559,14 +630,14 @@
         hw_opt->nthreads_omp > 0 &&
         hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
     {
-        gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+        gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
                   hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
     }
 
     if (hw_opt->nthreads_tmpi > 0 &&
         hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
     {
-        gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+        gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
                   hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
     }
 
@@ -584,12 +655,10 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
         }
     }
 
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP && hw_opt->nthreads_omp > 1)
     {
         gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
     }
-#endif
 
     if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
     {
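check_and_update_hw_opt_1() mostly rewords its messages from "threads" to "ranks", but the arithmetic it enforces is easy to state on its own. A hypothetical self-contained check of the same two rules (not the GROMACS function itself):

    #include <cassert>

    static bool threadCountsConsistent(int nTot, int nTmpi, int nOmp)
    {
        /* When all three counts are requested, they must multiply out... */
        if (nTot > 0 && nTmpi > 0 && nOmp > 0 && nTot != nTmpi*nOmp)
        {
            return false;
        }
        /* ...and the total must divide evenly over the thread-MPI ranks. */
        if (nTot > 0 && nTmpi > 0 && nTot % nTmpi != 0)
        {
            return false;
        }
        return true;
    }

    int main()
    {
        assert(threadCountsConsistent(12, 4, 3));  /* 12 == 4*3   */
        assert(!threadCountsConsistent(12, 5, 0)); /* 12 % 5 != 0 */
        return 0;
    }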
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h
index ca98237741..42f4f05182 100644
--- a/src/programs/mdrun/resource-division.h
+++ b/src/programs/mdrun/resource-division.h
@@ -45,9 +45,10 @@
  * At the point we have already called check_and_update_hw_opt.
  * Thus all options should be internally consistent and consistent
  * with the hardware, except that ntmpi could be larger than #GPU.
+ * If necessary, this function will modify hw_opt->nthreads_omp.
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                      const gmx_mtop_t    *mtop,
                      const t_commrec     *cr,
@@ -58,12 +59,12 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 * intended to catch cases where the user starts 1 MPI rank per hardware
 * thread or 1 rank per physical node.
 * With a sub-optimal setup a note is printed to fplog and stderr when
- * bNtOptSet==TRUE; with bNtOptSet==FALSE a fatal error is issued.
+ * bNtOmpOptionSet==TRUE; with bNtOmpOptionSet==FALSE a fatal error is issued.
 * This function should be called after thread-MPI and OpenMP are set up.
 */
 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                         t_commrec           *cr,
                                         FILE                *fplog);
 
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index a917c62ec5..f25ad46c9b 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -781,10 +781,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                                                 hw_opt, inputrec, mtop,
                                                 cr, fplog);
 
-        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
-        {
-            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
-        }
         if (hw_opt->nthreads_tmpi > 1)
         {
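The runner.cpp hunk deletes the caller-side fallback that spread a fixed total thread count evenly over the ranks. Per the new header comment, get_nthreads_mpi() may now modify hw_opt->nthreads_omp itself, so the derivation lives inside that function rather than at the call site. The removed arithmetic, in an illustrative standalone form (assumes nTmpi >= 1, which holds once get_nthreads_mpi() has returned):

    /* The logic removed from mdrunner(): with an explicit total but no
     * explicit per-rank count, derive the per-rank OpenMP thread count. */
    static int derivedOmpCount(int nTot, int nTmpi, int nOmp)
    {
        return (nTot > 0 && nOmp <= 0) ? nTot/nTmpi : nOmp;
    }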