From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Wed, 1 Oct 2014 20:02:02 +0000 (+0200)
Subject: Merge release-5-0 into master
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=19d3c2e5d0c401eb59010960d11a18b6ba2c54c6;hp=-c;p=alexxy%2Fgromacs.git

Merge release-5-0 into master

Conflicts:
	CMakeLists.txt
Version numbers not bumped; fixed to use the right
name for RelWithDebInfo.

	cmake/gmxCFlags.cmake
Fixed to use the right name for RelWithDebInfo.

	src/gromacs/listed-forces/bonded.cpp
New RB SIMD function in bonded.cpp had unused variables, now
eliminated

	src/gromacs/mdlib/domdec.cpp
Bug fixes from release-5-0 incorporated. std::max now used in code
newly arrived from release-5-0.

md.cpp had no conflict, but fr->nbv->bUseGPU had to
be replaced by use_GPU(fr->nbv) to work in master branch.

Change-Id: I65326b691745111fbdaa9435be6c92fa1acf6e7d
---

19d3c2e5d0c401eb59010960d11a18b6ba2c54c6
diff --combined CMakeLists.txt
index 70185949c4,53498532af..7fbe947522
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -48,28 -48,75 +48,28 @@@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CM
  set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
  
 -# PROJECT_VERSION should have the following structure:
 -# VERSION-dev[-SUFFIX] where the VERSION should have the for: vMajor.vMinor.vPatch
 -#
 -# The "-dev" suffix is important to keep because it makes possible to distinguish
 -# between a build from official release and a build from git release branch on a
 -# machine with no git.
 -#
 -# NOTE: when releasing the "-dev" suffix needs to be stripped off!
 -# REGRESSIONTEST_VERSION and REGRESSIONTEST_BRANCH should always be
 -# defined.
 -set(PROJECT_VERSION "5.0.3-dev")
 -# If this is a released tarball, "-dev" will not be present in
 -# PROJECT_VERSION, and REGRESSIONTEST_VERSION specifies the version
 -# number of the regressiontest tarball against which the code tarball
 -# can be tested. This will be the version of the last patch release.
 -set(REGRESSIONTEST_VERSION "5.0.3-dev")
 -# The MD5 checksum of the regressiontest tarball. Only used if "-dev"
 -# is not present in the PROJECT_VERSION
 -set(REGRESSIONTEST_MD5SUM "004f2377adf252e2caa9c241390d4b9c")
 -# If this is not a released tarball, "-dev" will be present in
 -# PROJECT_VERSION, and REGRESSIONTEST_BRANCH specifies the name of the
 -# gerrit.gromacs.org branch whose HEAD can test this code, *if* this
 -# code contains all recent fixes from the corresponding code branch.
 -set(REGRESSIONTEST_BRANCH "refs/heads/release-5-0")
 -
 -set(CUSTOM_VERSION_STRING ""
 -    CACHE STRING "Custom version string (if empty, use hard-coded default)")
 -mark_as_advanced(CUSTOM_VERSION_STRING)
 -if (CUSTOM_VERSION_STRING)
 -    set(PROJECT_VERSION ${CUSTOM_VERSION_STRING})
 -endif()
 -set(LIBRARY_SOVERSION 0)
 -set(LIBRARY_VERSION ${LIBRARY_SOVERSION}.0.0)
 -# It is a bit irritating, but this has to be set separately for now!
 -SET(CPACK_PACKAGE_VERSION_MAJOR "5")
 -SET(CPACK_PACKAGE_VERSION_MINOR "0")
 -#SET(CPACK_PACKAGE_VERSION_PATCH "0")
 -
 -# The numerical gromacs version. It is 40600 for 4.6.0.
 -# The #define GMX_VERSION in gromacs/version.h is set to this value.
 -math(EXPR NUM_VERSION
 -    "${CPACK_PACKAGE_VERSION_MAJOR}*10000 + ${CPACK_PACKAGE_VERSION_MINOR}*100")
 -if(CPACK_PACKAGE_VERSION_PATCH)
 -    math(EXPR NUM_VERSION
 -         "${NUM_VERSION} + ${CPACK_PACKAGE_VERSION_PATCH}")
 -endif()
 -
 -# The API version tracks the numerical Gromacs version (for now).
 -# It is potentially different from the Gromacs version in the future, if
 -# the programs/libraries diverge from the presumably more stable API.
 -# The #define GMX_API_VERSION in version.h is set to this value to
 -# provide backward compatibility of software written against the Gromacs API.
 -set(API_VERSION ${NUM_VERSION})
 +# Set up common version variables, as well as general information about
 +# the build tree (whether the build is from a source package or from a git
 +# repository).  Also declares a few functions that will be used for generating
 +# version info files later.
 +include(gmxVersionInfo)
  
  if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
      set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
  endif()
  
  include(gmxBuildTypeReference)
 +include(gmxBuildTypeProfile)
  include(gmxBuildTypeTSAN)
  include(gmxBuildTypeASAN)
  include(gmxBuildTypeReleaseWithAssert)
  
  if(NOT CMAKE_BUILD_TYPE)
 -    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert." FORCE)
 +    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert Profile." FORCE)
      # There's no need to offer a user the choice of ThreadSanitizer
      # Set the possible values of build type for cmake-gui
      set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
 -        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert")
 +        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert" "Profile")
  endif()
  if(CMAKE_CONFIGURATION_TYPES)
      # Add appropriate GROMACS-specific build types for the Visual
@@@ -81,10 -128,34 +81,10 @@@
          "List of configuration types"
          FORCE)
  endif()
- set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
 -set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
  
 -enable_language(C)
 -enable_language(CXX)
  set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
  
 -set(CPACK_PACKAGE_NAME "gromacs")
 -set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
 -set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}")
 -set(CPACK_PACKAGE_VENDOR "gromacs.org")
 -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gromacs - a toolkit for high-performance molecular simulation")
 -set(CPACK_RESOURCE_FILE_WELCOME "${CMAKE_SOURCE_DIR}/admin/InstallWelcome.txt")
 -# Its GPL/LGPL, so they do not have to agree to a license for mere usage, but some installers require this...
 -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING")
 -set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/admin/InstallInfo.txt")
 -set(CPACK_SOURCE_IGNORE_FILES "\\\\.isreposource$;\\\\.git/;\\\\.gitignore$;\\\\.gitattributes;")
 -set(CPACK_PROJECT_CONFIG_FILE "${CMAKE_SOURCE_DIR}/CPackInit.cmake")
 -# CPack source archives include only the directories we list here.
 -# This variable is a list of pairs of names of source and destination
 -# directories. Most of these are used for content GROMACS generates as
 -# part of the configuration or build.
 -set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${CMAKE_BINARY_DIR}/src/programs/completion;src/programs/completion;${CMAKE_BINARY_DIR}/docs/man/man1;docs/man/man1;${CMAKE_BINARY_DIR}/docs/man/man7;docs/man/man7;${CMAKE_BINARY_DIR}/docs/old-html/final;docs/old-html/final;${CMAKE_BINARY_DIR}/docs/install-guide/final;/")
 -set(CPACK_PACKAGE_CONTACT "gmx-users@gromacs.org")
 -set(CPACK_GMX_BUILD_HELP "${GMX_BUILD_HELP}") #Works even though GMX_BUILD_HELP is defined later because it is off by default.
 -
 -#must come after all cpack settings!
 -include(CPack)
 -
  # Set a default valgrind suppression file.
  # This unfortunately needs to duplicate information from CTest to work as
  # expected...
@@@ -94,12 -165,14 +94,12 @@@ set(MEMORYCHECK_SUPPRESSIONS_FIL
      "File that contains suppressions for the memory checker")
  include(CTest)
  
 -set(SOURCE_IS_GIT_REPOSITORY OFF)
 -set(SOURCE_IS_SOURCE_DISTRIBUTION OFF)
 -if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
 -    set(SOURCE_IS_GIT_REPOSITORY ON)
 -endif()
 -if(NOT EXISTS "${CMAKE_SOURCE_DIR}/admin/.isreposource")
 -    set(SOURCE_IS_SOURCE_DISTRIBUTION ON)
 -endif()
 +include(gmxCPackUtilities)
 +gmx_cpack_init()
 +
 +# Variables that accumulate stuff influencing the installed headers
 +set(INSTALLED_HEADER_INCLUDE_DIRS "")
 +set(INSTALLED_HEADER_DEFINITIONS "")
  
  ########################################################################
  # Check and warn if cache generated on a different host is being reused
@@@ -190,7 -263,7 +190,7 @@@ gmx_option_multichoice
      GMX_SIMD
      "SIMD instruction set for CPU kernels and compiler optimization"
      "${GMX_SUGGESTED_SIMD}"
 -    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference)
 +    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
  
  gmx_option_multichoice(
      GMX_FFT_LIBRARY
@@@ -231,6 -304,9 +231,6 @@@ option(GMX_OPENMP "Enable OpenMP-based 
  
  option(GMX_USE_TNG "Use the TNG library for trajectory I/O" ON)
  
 -option(GMX_GIT_VERSION_INFO "Generate git version information" ${SOURCE_IS_GIT_REPOSITORY})
 -mark_as_advanced(GMX_GIT_VERSION_INFO)
 -
  if(UNIX)
      option(GMX_SYMLINK_OLD_BINARY_NAMES "Create symbolic links for pre-5.0 binary names" ON)
  endif()
@@@ -304,15 -380,17 +304,15 @@@ if(GMX_SIMD STREQUAL "AVX_256
  endif()
  
  
 -
 -set(PKG_CFLAGS "")
  if(GMX_DOUBLE)
      add_definitions(-DGMX_DOUBLE)
 -    set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_DOUBLE")
 +    list(APPEND INSTALLED_HEADER_DEFINITIONS "-DGMX_DOUBLE")
      if(GMX_RELAXED_DOUBLE_PRECISION)
          add_definitions(-DGMX_RELAXED_DOUBLE_PRECISION)
      endif()
  endif()
  if(GMX_SOFTWARE_INVSQRT)
 -  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
 +    list(APPEND INSTALLED_HEADER_DEFINITIONS "-DGMX_SOFTWARE_INVSQRT")
  endif()
  
  if(WIN32 AND NOT CYGWIN)
@@@ -477,6 -555,25 +477,6 @@@ if(WIN32 AND NOT CYGWIN
      add_definitions(-DNOMINMAX)
  endif()
  
 -# only bother with finding git and using version.h if the source is a git repo
 -if(GMX_GIT_VERSION_INFO)
 -    if (NOT SOURCE_IS_GIT_REPOSITORY)
 -        message(FATAL_ERROR
 -            "Cannot generate git version information from source tree not under git. "
 -            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
 -    endif()
 -    # We need at least git v1.5.3 be able to parse git's date output. If not
 -    # found or the version is too small, we can't generate version information.
 -    find_package(Git)
 -
 -    if(NOT GIT_FOUND OR GIT_VERSION_STRING VERSION_LESS "1.5.3")
 -        message(FATAL_ERROR
 -            "No compatible git version found (>= 1.5.3 required). "
 -            "Won't be able to generate development version information. "
 -            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
 -    endif()
 -endif()
 -
  # Detect boost unless GMX_EXTERNAL_BOOST is explicitly OFF
  # Used for default if GMX_EXTERNAL_BOOST is not defined (first CMake pass)
  if(NOT DEFINED GMX_EXTERNAL_BOOST OR GMX_EXTERNAL_BOOST)
@@@ -519,6 -616,12 +519,6 @@@ if(GMX_USE_TNG AND NOT GMX_EXTERNAL_TNG
      gmx_test_zlib(HAVE_ZLIB)
  endif()
  
 -########################################################################
 -# Generate development version info for cache
 -########################################################################
 -# set(GEN_VERSION_INFO_INTERNAL "ON")
 -# include(gmxGenerateVersionString)
 -
  ########################################################################
  # Our own GROMACS tests
  ########################################################################
@@@ -528,6 -631,10 +528,6 @@@ include_directories(BEFORE ${CMAKE_SOUR
  include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/include)
  # Required for config.h, maybe should only be set in src/CMakeLists.txt
  include_directories(BEFORE ${CMAKE_BINARY_DIR}/src)
 -# Required for gmx_header_config_gen.h to be found before installation
 -include_directories(BEFORE ${CMAKE_BINARY_DIR}/src/gromacs/utility)
 -# Required for now to make old code compile
 -include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders)
  
  include(gmxTestInlineASM)
  gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
@@@ -682,18 -789,14 +682,18 @@@ if(GMX_FAHCORE
    include_directories(${COREWRAP_INCLUDE_DIR})
  endif()
  
 -option(GMX_BUILD_HELP "Build man pages, HTML help, and completions automatically (requires that compiled binaries can be executed on the build host)" OFF)
 +# Value of AUTO tries to generate things, but will only produce warnings if
 +# that fails.
 +set(build_help_default AUTO)
 +if (SOURCE_IS_SOURCE_DISTRIBUTION)
 +    set(build_help_default OFF)
 +endif()
 +gmx_option_trivalue(GMX_BUILD_HELP "Build man pages, HTML help, and completions automatically (requires that compiled binaries can be executed on the build host)" ${build_help_default})
  mark_as_advanced(GMX_BUILD_HELP)
 -if (GMX_BUILD_HELP AND SOURCE_IS_SOURCE_DISTRIBUTION AND
 -    "${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
 -
 +if (GMX_BUILD_HELP AND SOURCE_IS_SOURCE_DISTRIBUTION AND BUILD_IS_INSOURCE)
      message(FATAL_ERROR
 -        "Rebuilding HTML and man pages is not supported for in-source "
 -        "builds from a source distribution. "
 +        "Rebuilding HTML and man pages or shell completions is not supported "
 +        "for in-source builds from a source distribution. "
          "Set GMX_BUILD_HELP=OFF or do an out-of-source build to proceed.")
  endif()
  
@@@ -725,6 -828,7 +725,6 @@@ else(
      set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} ${OpenMP_LINKER_FLAGS})
      set(GMX_SHARED_LINKER_FLAGS ${GMX_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS})
  endif()
 -set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
  
  ########################################################################
  # Specify install locations
@@@ -741,24 -845,17 +741,24 @@@ mark_as_advanced(GMX_LIB_INSTALL_DIR GM
  
  # These variables are used internally to provide a central location for
  # customizing the install locations.
 -set(LIB_INSTALL_DIR  ${GMX_LIB_INSTALL_DIR})
 -set(BIN_INSTALL_DIR  bin)
 -set(DATA_INSTALL_DIR share/${GMX_DATA_INSTALL_DIR})
 -set(MAN_INSTALL_DIR  share/man)
 -set(INCL_INSTALL_DIR include)
 +set(LIB_INSTALL_DIR       ${GMX_LIB_INSTALL_DIR})
 +set(BIN_INSTALL_DIR       bin)
 +set(DATA_INSTALL_DIR      share/${GMX_DATA_INSTALL_DIR})
 +set(MAN_INSTALL_DIR       share/man)
 +# If the nesting level wrt. the installation root is changed,
 +# gromacs-config.cmake.cmakein needs to be adapted.
 +set(CMAKE_INSTALL_DIR     share/cmake)
 +# TODO: Make GMXRC adapt if this is changed
 +set(PKGCONFIG_INSTALL_DIR ${LIB_INSTALL_DIR}/pkgconfig)
 +set(INCL_INSTALL_DIR      include)
  
  # These variables get written into config.h for use in finding the data
  # directories.
  set(GMXLIB_SEARCH_DIR share/${GMX_DATA_INSTALL_DIR}/top)
  set(GMXLIB_FALLBACK   ${CMAKE_INSTALL_PREFIX}/${DATA_INSTALL_DIR}/top)
  
 +list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR})
 +
  # Binary and library suffix options
  include(gmxManageSuffixes)
  
@@@ -804,14 -901,15 +804,14 @@@ install(FILES COPYING DESTINATION ${DAT
  
  if(GMX_EXTERNAL_BOOST)
      include_directories(${Boost_INCLUDE_DIRS})
 -    set(PKG_CFLAGS "${PKG_CFLAGS} -I${Boost_INCLUDE_DIRS}")
 +    list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${Boost_INCLUDE_DIRS})
  else()
      include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/boost)
 +    list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR}/gromacs/external/boost)
 +    list(APPEND INSTALLED_HEADED_DEFINITIONS "-DBOOST_NO_TYPEID")
      # typeid not supported for minimal internal version
      # (would add significant amount of code)
      add_definitions(-DBOOST_NO_TYPEID)
 -    # TODO: Propagate the above settings to the installed CMakeFiles.txt template
 -    # (from share/template/)
 -    set(PKG_CFLAGS "${PKG_CFLAGS} -DBOOST_NO_TYPEID -I${CMAKE_INSTALL_PREFIX}/${INCL_INSTALL_DIR}/gromacs/external/boost")
      if (NOT GMX_BUILD_MDRUN_ONLY)
          install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/external/boost/boost
                  DESTINATION ${INCL_INSTALL_DIR}/gromacs/external/boost
@@@ -854,8 -952,6 +854,8 @@@ if (BUILD_TESTING
      add_subdirectory(tests)
  endif()
  
 +gmx_cpack_write_config()
 +
  # Issue a warning if NVIDIA GPUs were detected, but CUDA was not found.
  # Don't bother the user after the first configure pass.
  if ((CUDA_NOTFOUND_AUTO AND GMX_DETECT_GPU_AVAILABLE) AND NOT GMX_GPU_DETECTION_DONE)
diff --combined cmake/gmxCFlags.cmake
index 011cada36e,124107b65c..3a0e3bbe15
--- a/cmake/gmxCFlags.cmake
+++ b/cmake/gmxCFlags.cmake
@@@ -64,7 -64,7 +64,7 @@@ function(gmx_set_cmake_compiler_flags
          # be set up elsewhere and passed to this function, but it is
          # inconvenient in CMake to pass more than one list, and such a
          # list is only used here.
-         foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
 -        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
              set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}")
          endforeach()
          # Copy the flags that are only used by the real Release build
@@@ -85,9 -85,12 +85,12 @@@
              endif()
  
              # Append to the variables for the given build type for
-             # each language, in the parent scope.
+             # each language, in the parent scope. We add our new variables at the end, so
+             # compiler-specific choices are more likely to override default CMake choices.
+             # This is for instance useful for RelWithDebInfo builds, where we want to use the full
+             # set of our optimization flags detected in this file, rather than having -O2 override them.
              set(CMAKE_${language}_FLAGS${punctuation}${build_type}
-                 "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}"
+                 "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}"
                  PARENT_SCOPE)
          endforeach()
      endforeach()
@@@ -110,12 -113,14 +113,12 @@@ MACRO(gmx_c_flags
          # Since 4.8 on by default. For previous version disabling is a no-op. Only disabling for Release because with assert
          # the warnings are OK.
          GMX_TEST_CFLAG(CFLAGS_WARN_REL "-Wno-array-bounds" GMXC_CFLAGS_RELEASE_ONLY)
 -        # Since gcc 4.8 strict - false postives with old gmx_fatal. TODO: Remove in master
 -        GMX_TEST_CFLAG(CFLAGS_WARN_UNINIT "-Wno-maybe-uninitialized" GMXC_CFLAGS)
          if(CYGWIN)
              GMX_TEST_CFLAG(CFLAGS_WARN_SUBSCRIPT "-Wno-char-subscripts" GMXC_CFLAGS)
          endif()
          # new in gcc 4.5
          GMX_TEST_CFLAG(CFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CFLAGS_RELEASE)
 -        GMX_TEST_CFLAG(CFLAGS_COPT "-fomit-frame-pointer -funroll-all-loops"
 +        GMX_TEST_CFLAG(CFLAGS_COPT "-funroll-all-loops"
                         GMXC_CFLAGS_RELEASE)
          GMX_TEST_CFLAG(CFLAGS_NOINLINE "-fno-inline" GMXC_CFLAGS_DEBUG)
      endif()
@@@ -131,7 -136,7 +134,7 @@@
          GMX_TEST_CFLAG(CXXFLAGS_WARN_REL "-Wno-array-bounds" GMXC_CXXFLAGS_RELEASE_ONLY)
          # new in gcc 4.5
          GMX_TEST_CXXFLAG(CXXFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CXXFLAGS_RELEASE)
 -        GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-fomit-frame-pointer -funroll-all-loops"
 +        GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-funroll-all-loops"
                           GMXC_CXXFLAGS_RELEASE)
          GMX_TEST_CXXFLAG(CXXFLAGS_NOINLINE "-fno-inline" GMXC_CXXFLAGS_DEBUG)
      endif()
@@@ -148,7 -153,9 +151,7 @@@
                      GMX_TEST_CFLAG(CFLAGS_PRAGMA "-wd161" GMXC_CFLAGS)
                  endif()
              endif()
 -# 111: statement is unreachable
 -# 177: function ".." was declared but never referenced
 -# 181: argument of type ".." is incompatible with format "..", expecting argument of type ".."
 +# 177: function/variable ".." was declared but never referenced
  # 193: zero used for undefined preprocessing identifier ".."
  # 271: trailing comma is nonstandard
  # 304: access control not specified ("public" by default)
@@@ -171,7 -178,7 +174,7 @@@
  #3346: dynamic exception specifications are deprecated
  #11074: Inlining inhibited by limit max-size(/max-total-size)
  #11076: To get full report use -opt-report=3 -opt-report-phase ipo (shown for previous remark)
 -            GMX_TEST_CFLAG(CFLAGS_WARN "-w3 -wd111 -wd177 -wd181 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076" GMXC_CFLAGS)
 +            GMX_TEST_CFLAG(CFLAGS_WARN "-w3 -wd177 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076" GMXC_CFLAGS)
              GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
              GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops -alias-const -ansi-alias" GMXC_CFLAGS_RELEASE)
          else()
@@@ -182,7 -189,7 +185,7 @@@
                      GMX_TEST_CFLAG(CFLAGS_PRAGMA "/wd161" GMXC_CFLAGS)
                  endif()
              endif()
 -            GMX_TEST_CFLAG(CFLAGS_WARN "/W3 /wd111 /wd177 /wd181 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346" GMXC_CFLAGS)
 +            GMX_TEST_CFLAG(CFLAGS_WARN "/W3 /wd177 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346" GMXC_CFLAGS)
              GMX_TEST_CFLAG(CFLAGS_OPT "/Qip" GMXC_CFLAGS_RELEASE)
          endif()
      endif()
@@@ -199,7 -206,7 +202,7 @@@
  #All but the following warnings are identical for the C-compiler (see above)
  #1782: #pragma once is obsolete
  #2282: unrecognized GCC pragma
 -            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-w3 -wd111 -wd177 -wd181 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076 -wd1782 -wd2282" GMXC_CXXFLAGS)
 +            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-w3 -wd177 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076 -wd1782 -wd2282" GMXC_CXXFLAGS)
              GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops -alias-const -ansi-alias" GMXC_CXXFLAGS_RELEASE)
          else()
              if(NOT GMX_OPENMP)
@@@ -209,7 -216,7 +212,7 @@@
                      GMX_TEST_CXXFLAG(CXXFLAGS_PRAGMA "/wd161" GMXC_CXXFLAGS)
                  endif()
              endif()
 -            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W3 /wd111 /wd177 /wd181 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346 /wd1782 /wd2282" GMXC_CXXFLAGS)
 +            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W3 /wd177 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346 /wd1782 /wd2282" GMXC_CXXFLAGS)
              GMX_TEST_CXXFLAG(CXXFLAGS_OPT "/Qip" GMXC_CXXFLAGS_RELEASE)
          endif()
      endif()
diff --combined src/gromacs/gmxana/gmx_tune_pme.c
index b9c2af3e71,a437803932..c4a7e6e021
--- a/src/gromacs/gmxana/gmx_tune_pme.c
+++ b/src/gromacs/gmxana/gmx_tune_pme.c
@@@ -32,36 -32,35 +32,36 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "config.h"
  
 +#include <stdlib.h>
  #include <time.h>
 +
  #ifdef HAVE_SYS_TIME_H
  #include <sys/time.h>
  #endif
  
  #include "gromacs/commandline/pargs.h"
 -#include "typedefs.h"
 -#include "types/commrec.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "vec.h"
 -#include "copyrite.h"
  #include "gromacs/fileio/tpxio.h"
 -#include "gromacs/utility/cstringutil.h"
 -#include "readinp.h"
 -#include "calcgrid.h"
 -#include "checkpoint.h"
 -#include "macros.h"
 -#include "gmx_ana.h"
 -#include "names.h"
 -#include "perf_est.h"
 -#include "inputrec.h"
 -#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/gmxana/gmx_ana.h"
 +#include "gromacs/legacyheaders/calcgrid.h"
 +#include "gromacs/legacyheaders/checkpoint.h"
 +#include "gromacs/legacyheaders/inputrec.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/perf_est.h"
 +#include "gromacs/legacyheaders/readinp.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/utilities.h"
 -
 -#include "gmx_fatal.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/utility/baseversion.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
  /* Enum for situations that can occur during log file parsing, the
   * corresponding string entries can be found in do_the_tests() in
@@@ -144,6 -143,29 +144,6 @@@ static void cleandata(t_perf *perfdata
  }
  
  
 -static gmx_bool is_equal(real a, real b)
 -{
 -    real diff, eps = 1.0e-7;
 -
 -
 -    diff = a - b;
 -
 -    if (diff < 0.0)
 -    {
 -        diff = -diff;
 -    }
 -
 -    if (diff < eps)
 -    {
 -        return TRUE;
 -    }
 -    else
 -    {
 -        return FALSE;
 -    }
 -}
 -
 -
  static void remove_if_exists(const char *fn)
  {
      if (gmx_fexist(fn))
@@@ -531,8 -553,8 +531,8 @@@ static gmx_bool analyze_data
      fprintf(fp, "\n");
  
      /* Only mention settings if they were modified: */
 -    bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 -    bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 +    bRefinedCoul = !gmx_within_tol(info->rcoulomb[k_win], info->rcoulomb[0], GMX_REAL_EPS);
 +    bRefinedVdW  = !gmx_within_tol(info->rvdw[k_win], info->rvdw[0], GMX_REAL_EPS);
      bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
                       info->nky[k_win] == info->nky[0] &&
                       info->nkz[k_win] == info->nkz[0]);
@@@ -594,25 -616,25 +594,25 @@@ static void get_program_paths(gmx_bool 
      {
          if ( (cp = getenv("MPIRUN")) != NULL)
          {
 -            *cmd_mpirun = strdup(cp);
 +            *cmd_mpirun = gmx_strdup(cp);
          }
          else
          {
 -            *cmd_mpirun = strdup(def_mpirun);
 +            *cmd_mpirun = gmx_strdup(def_mpirun);
          }
      }
      else
      {
 -        *cmd_mpirun = strdup(empty_mpirun);
 +        *cmd_mpirun = gmx_strdup(empty_mpirun);
      }
  
      if ( (cp = getenv("MDRUN" )) != NULL)
      {
 -        *cmd_mdrun  = strdup(cp);
 +        *cmd_mdrun  = gmx_strdup(cp);
      }
      else
      {
 -        *cmd_mdrun  = strdup(def_mdrun);
 +        *cmd_mdrun  = gmx_strdup(def_mdrun);
      }
  }
  
@@@ -964,11 -986,11 +964,11 @@@ static void make_benchmark_tprs
          {
              /* Determine which Coulomb radii rc to use in the benchmarks */
              add = (rmax-rmin)/(*ntprs-1);
 -            if (is_equal(rmin, info->rcoulomb[0]))
 +            if (gmx_within_tol(rmin, info->rcoulomb[0], GMX_REAL_EPS))
              {
                  ir->rcoulomb = rmin + j*add;
              }
 -            else if (is_equal(rmax, info->rcoulomb[0]))
 +            else if (gmx_within_tol(rmax, info->rcoulomb[0], GMX_REAL_EPS))
              {
                  ir->rcoulomb = rmin + (j-1)*add;
              }
@@@ -1068,8 -1090,8 +1068,8 @@@
          fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
  
          /* Make it clear to the user that some additional settings were modified */
 -        if (!is_equal(ir->rvdw, info->rvdw[0])
 -            || !is_equal(ir->rlistlong, info->rlistlong[0]) )
 +        if (!gmx_within_tol(ir->rvdw, info->rvdw[0], GMX_REAL_EPS)
 +            || !gmx_within_tol(ir->rlistlong, info->rlistlong[0], GMX_REAL_EPS) )
          {
              bNote = TRUE;
          }
@@@ -1308,9 -1330,12 +1308,12 @@@ static void make_sure_it_runs(char *mdr
      {
          /* To prevent confusion, do not again issue a gmx_fatal here since we already
           * get the error message from mdrun itself */
-         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
+         sprintf(msg,
+                 "Cannot run the first benchmark simulation! Please check the error message of\n"
                  "mdrun for the source of the problem. Did you provide a command line\n"
-                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+                 "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
+                 "sure your command line should work, you can bypass this check with \n"
+                 "gmx tune_pme -nocheck. The failing command was:\n"
                  "\n%s\n\n", command);
  
          fprintf(stderr, "%s", msg);
@@@ -1341,7 -1366,7 +1344,7 @@@ static void do_the_tests
          int             npme_fixed,     /* If >= -1, test fixed number of PME
                                           * nodes only                             */
          const char     *npmevalues_opt, /* Which -npme values should be tested    */
-         t_perf        **perfdata,       /* Here the performace data is stored     */
+         t_perf        **perfdata,       /* Here the performance data is stored    */
          int            *pmeentries,     /* Entries in the nPMEnodes list          */
          int             repeats,        /* Repeat each test this often            */
          int             nnodes,         /* Total number of nodes = nPP + nPME     */
@@@ -1354,7 -1379,8 +1357,8 @@@
          const t_filenm *fnm,            /* List of filenames from command line    */
          int             nfile,          /* Number of files specified on the cmdl. */
          int             presteps,       /* DLB equilibration steps, is checked    */
-         gmx_int64_t     cpt_steps)      /* Time step counter in the checkpoint    */
+         gmx_int64_t     cpt_steps,      /* Time step counter in the checkpoint    */
+         gmx_bool        bCheck)         /* Check whether benchmark mdrun works    */
  {
      int      i, nr, k, ret, count = 0, totaltests;
      int     *nPMEnodes = NULL;
@@@ -1454,8 -1480,11 +1458,11 @@@
                          cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
  
                  /* To prevent that all benchmarks fail due to a show-stopper argument
-                  * on the mdrun command line, we make a quick check first */
-                 if (bFirst)
+                  * on the mdrun command line, we make a quick check first.
+                  * This check can be turned off in cases where the automatically chosen
+                  * number of PME-only ranks leads to a number of PP ranks for which no
+                  * decomposition can be found (e.g. for large prime numbers) */
+                 if (bFirst && bCheck)
                  {
                      make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
                  }
@@@ -1631,13 -1660,13 +1638,13 @@@ static void check_input
      /* Add test scenarios if rmin or rmax were set */
      if (*ntprs <= 2)
      {
 -        if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
 +        if (!gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1) )
          {
              (*ntprs)++;
              fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
                      *rmin, *ntprs);
          }
 -        if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
 +        if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1) )
          {
              (*ntprs)++;
              fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
@@@ -1646,13 -1675,13 +1653,13 @@@
      }
      old = *ntprs;
      /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
 -    if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
 +    if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) || !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) )
      {
          *ntprs = max(*ntprs, 2);
      }
  
      /* If both rmin, rmax are set, we need 3 tpr files at minimum */
 -    if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
 +    if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) )
      {
          *ntprs = max(*ntprs, 3);
      }
@@@ -1664,7 -1693,7 +1671,7 @@@
  
      if (*ntprs > 1)
      {
 -        if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
 +        if (gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && gmx_within_tol(rcoulomb, *rmax, GMX_REAL_EPS)) /* We have just a single rc */
          {
              fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
                      "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
@@@ -2016,6 -2045,9 +2023,9 @@@ int gmx_tune_pme(int argc, char *argv[]
          "need to provide a machine- or hostfile. This can also be passed",
          "via the MPIRUN variable, e.g.[PAR]",
          "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+         "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
+         "check whether mdrun works as expected with the provided parallel settings",
+         "if the [TT]-check[tt] option is activated (the default).",
          "Please call [THISMODULE] with the normal options you would pass to",
          "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
          "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
@@@ -2025,7 -2057,7 +2035,7 @@@
          "written with enlarged cutoffs and smaller Fourier grids respectively.",
          "Typically, the first test (number 0) will be with the settings from the input",
          "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
-         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
+         "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
          "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
          "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
          "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
@@@ -2035,7 -2067,7 +2045,7 @@@
          "MD systems. The dynamic load balancing needs about 100 time steps",
          "to adapt to local load imbalances, therefore the time step counters",
          "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
-         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+         "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
          "From the 'DD' load imbalance entries in the md.log output file you",
          "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
          "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
@@@ -2092,9 -2124,9 +2102,9 @@@
          /* g_tune_pme */
          { efOUT, "-p",      "perf",     ffWRITE },
          { efLOG, "-err",    "bencherr", ffWRITE },
 -        { efTPX, "-so",     "tuned",    ffWRITE },
 +        { efTPR, "-so",     "tuned",    ffWRITE },
          /* mdrun: */
 -        { efTPX, NULL,      NULL,       ffREAD },
 +        { efTPR, NULL,      NULL,       ffREAD },
          { efTRN, "-o",      NULL,       ffWRITE },
          { efCOMPRESSED, "-x", NULL,     ffOPTWR },
          { efCPT, "-cpi",    NULL,       ffOPTRD },
@@@ -2162,6 -2194,7 +2172,7 @@@
      gmx_bool     bKeepAndNumCPT        = FALSE;
      gmx_bool     bResetCountersHalfWay = FALSE;
      gmx_bool     bBenchmark            = TRUE;
+     gmx_bool     bCheck                = TRUE;
  
      output_env_t oenv = NULL;
  
@@@ -2205,6 -2238,8 +2216,8 @@@
            "Launch the real simulation after optimization" },
          { "-bench",    FALSE, etBOOL, {&bBenchmark},
            "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+         { "-check",    FALSE, etBOOL, {&bCheck},
+           "Before the benchmark runs, check whether mdrun works in parallel" },
          /******************/
          /* mdrun options: */
          /******************/
@@@ -2364,8 -2399,7 +2377,8 @@@
      sep_line(fp);
      fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
      sep_line(fp);
 -    fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
 +    fprintf(fp, "%s for Gromacs %s\n", output_env_get_program_display_name(oenv),
 +            gmx_version());
      if (!bThreads)
      {
          fprintf(fp, "Number of ranks         : %d\n", nnodes);
@@@ -2457,7 -2491,7 +2470,7 @@@
      {
          do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
                       repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
-                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck);
  
          fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
  
diff --combined src/gromacs/gmxlib/calcgrid.c
index 1de1f57efe,81856d489c..7bbee76f12
--- a/src/gromacs/gmxlib/calcgrid.c
+++ b/src/gromacs/gmxlib/calcgrid.c
@@@ -34,15 -34,16 +34,15 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "gromacs/legacyheaders/calcgrid.h"
  
  #include <math.h>
  
 -#include "typedefs.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/smalloc.h"
 -#include "gmx_fatal.h"
 -#include "calcgrid.h"
  
  /* The grid sizes below are based on timing of a 3D cubic grid in fftw
   * compiled with SSE using 4 threads in fft5d.c.
@@@ -136,7 -137,7 +136,7 @@@ real calc_grid(FILE *fp, matrix box, re
                  /* Determine how many pre-factors of 2 we need */
                  fac2 = 1;
                  i    = g_baseNR - 1;
-                 while (fac2*grid_base[i-1] < nmin)
+                 while (fac2*grid_base[i] < nmin)
                  {
                      fac2 *= 2;
                  }
diff --combined src/gromacs/gmxpreprocess/toppush.c
index 161940b72e,fab21dda51..8272b7787e
--- a/src/gromacs/gmxpreprocess/toppush.c
+++ b/src/gromacs/gmxpreprocess/toppush.c
@@@ -34,27 -34,28 +34,27 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 +#include "toppush.h"
 +
 +#include <assert.h>
  #include <ctype.h>
  #include <math.h>
 -#include <assert.h>
 +#include <stdlib.h>
  
 -#include "sysstuff.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "macros.h"
 +#include "gromacs/gmxpreprocess/gpp_atomtype.h"
 +#include "gromacs/gmxpreprocess/gpp_bond_atomtype.h"
 +#include "gromacs/gmxpreprocess/readir.h"
 +#include "gromacs/gmxpreprocess/topdirs.h"
 +#include "gromacs/gmxpreprocess/toputil.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/warninp.h"
 +#include "gromacs/topology/symtab.h"
  #include "gromacs/utility/cstringutil.h"
 -#include "names.h"
 -#include "toputil.h"
 -#include "toppush.h"
 -#include "topdirs.h"
 -#include "readir.h"
 -#include "symtab.h"
 -#include "gmx_fatal.h"
 -#include "warninp.h"
 -#include "gpp_atomtype.h"
 -#include "gpp_bond_atomtype.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
  void generate_nbparams(int comb, int ftype, t_params *plist, gpp_atomtype_t atype,
                         warninp_t wi)
@@@ -599,9 -600,11 +599,11 @@@ static void push_bondtype(t_params     
                      {
                          sprintf(errbuf, "Overriding %s parameters.%s",
                                  interaction_function[ftype].longname,
-                                 (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : "");
+                                 (ftype == F_PDIHS) ?
+                                 "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other."
+                                 : "");
                          warning(wi, errbuf);
-                         fprintf(stderr, "  old:");
+                         fprintf(stderr, "  old:                                         ");
                          for (j = 0; (j < nrfp); j++)
                          {
                              fprintf(stderr, " %g", bt->param[i].c[j]);
diff --combined src/gromacs/legacyheaders/domdec.h
index 13b55711e8,240a7901cc..c44c7bf840
--- a/src/gromacs/legacyheaders/domdec.h
+++ b/src/gromacs/legacyheaders/domdec.h
@@@ -36,10 -36,9 +36,10 @@@
  #ifndef _domdec_h
  #define _domdec_h
  
 -#include "typedefs.h"
 -#include "vsite.h"
 -#include "genborn.h"
 +#include "gromacs/legacyheaders/genborn.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/vsite.h"
 +#include "gromacs/timing/wallcycle.h"
  
  #ifdef __cplusplus
  extern "C" {
@@@ -139,6 -138,12 +139,12 @@@ void change_dd_dlb_cutoff_limit(t_commr
   * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
   */
  
+ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+ /* Return if the DLB lock is set */
+ 
+ void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+ /* Set a lock such that with DLB=auto DLB can (not) get turned on */
+ 
  void dd_setup_dlb_resource_sharing(t_commrec           *cr,
                                     const gmx_hw_info_t *hwinfo,
                                     const gmx_hw_opt_t  *hw_opt);
diff --combined src/gromacs/listed-forces/bonded.cpp
index b1ded380a4,0eb8d55791..dfb846d9e7
--- a/src/gromacs/listed-forces/bonded.cpp
+++ b/src/gromacs/listed-forces/bonded.cpp
@@@ -34,50 -34,36 +34,50 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +/*! \internal \file
 + *
 + * \brief This file defines functions necessary for mdrun and tools to
 + * compute energies and forces for bonded interactions.
 + *
 + * \author Mark Abraham <mark.j.abraham@gmail.com>
 + *
 + * \ingroup module_listed-forces
 + */
 +#include "gmxpre.h"
 +
 +#include "bonded.h"
 +
 +#include "config.h"
  
 -#include <math.h>
  #include <assert.h>
 -#include "physics.h"
 -#include "vec.h"
 -#include "gromacs/math/utilities.h"
 -#include "txtdump.h"
 -#include "bondf.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "pbc.h"
 -#include "ns.h"
 -#include "macros.h"
 -#include "names.h"
 -#include "gmx_fatal.h"
 -#include "mshift.h"
 -#include "main.h"
 -#include "disre.h"
 -#include "orires.h"
 -#include "force.h"
 -#include "nonbonded.h"
 -#include "restcbt.h"
  
 +#include <cmath>
 +
 +#include <algorithm>
 +
 +#include "gromacs/legacyheaders/disre.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/nonbonded.h"
 +#include "gromacs/legacyheaders/ns.h"
 +#include "gromacs/legacyheaders/orires.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/utilities.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/simd/simd.h"
  #include "gromacs/simd/simd_math.h"
  #include "gromacs/simd/vector_operations.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
 -/* Find a better place for this? */
 +#include "restcbt.h"
 +
 +/*! \brief Mysterious CMAP coefficient matrix */
  const int cmap_coeff_matrix[] = {
      1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4,
      0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
@@@ -98,9 -84,7 +98,9 @@@
  };
  
  
 -
 +/* TODO This function should go and live in nonbonded.c where it is
 +   really needed. Here, it only supports giving a fatal error message
 +   with FENE_bonds */
  int glatnr(int *global_atom_index, int i)
  {
      int atnr;
@@@ -117,9 -101,6 +117,9 @@@
      return atnr;
  }
  
 +/*! \brief Compute dx = xi - xj, modulo PBC if non-NULL
 + *
 + * \todo This kind of code appears in many places. Consolidate it */
  static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
  {
      if (pbc)
@@@ -148,7 -129,7 +148,7 @@@ typedef struct 
      gmx_simd_real_t bxx;
  } pbc_simd_t;
  
 -/* Set the SIMD pbc data from a normal t_pbc struct */
 +/*! \brief Set the SIMD pbc data from a normal t_pbc struct */
  static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
  {
      rvec inv_bdiag;
@@@ -188,7 -169,7 +188,7 @@@
      }
  }
  
 -/* Correct distance vector *dx,*dy,*dz for PBC using SIMD */
 +/*! \brief Correct distance vector *dx,*dy,*dz for PBC using SIMD */
  static gmx_inline void
  pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
              const pbc_simd_t *pbc)
@@@ -210,9 -191,10 +210,9 @@@
  
  #endif /* GMX_SIMD_HAVE_REAL */
  
 -/*
 - * Morse potential bond by Frank Everdij
 +/*! \brief Morse potential bond
   *
 - * Three parameters needed:
 + * By Frank Everdij. Three parameters needed:
   *
   * b0 = equilibrium distance in nm
   * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
@@@ -221,6 -203,7 +221,6 @@@
   * Note: the potential is referenced to be +cb at infinite separation
   *       and zero at the equilibrium distance!
   */
 -
  real morse_bonds(int nbonds,
                   const t_iatom forceatoms[], const t_iparams forceparams[],
                   const rvec x[], rvec f[], rvec fshift[],
@@@ -295,7 -278,6 +295,7 @@@
      return vtot;
  }
  
 +//! \cond
  real cubic_bonds(int nbonds,
                   const t_iatom forceatoms[], const t_iparams forceparams[],
                   const rvec x[], rvec f[], rvec fshift[],
@@@ -369,7 -351,7 +369,7 @@@ real FENE_bonds(int nbonds
      const real half = 0.5;
      const real one  = 1.0;
      real       bm, kb;
 -    real       dr, dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
 +    real       dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
      rvec       dx;
      int        i, m, ki, type, ai, aj;
      ivec       dt;
@@@ -769,6 -751,7 +769,6 @@@ real water_pol(int nbonds
          kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
          kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
          r_HH   = 1.0/forceparams[type0].wpol.rHH;
 -        r_OD   = 1.0/forceparams[type0].wpol.rOD;
          if (debug)
          {
              fprintf(debug, "WPOL: qS  = %10.5f aS = %5d\n", qS, aS);
@@@ -892,7 -875,7 +892,7 @@@ static real do_1_thole(const rvec xi, c
                         rvec fshift[], real afac)
  {
      rvec r12;
 -    real r12sq, r12_1, r12n, r12bar, v0, v1, fscal, ebar, fff;
 +    real r12sq, r12_1, r12bar, v0, v1, fscal, ebar, fff;
      int  m, t;
  
      t      = pbc_rvec_sub(pbc, xi, xj, r12);                      /*  3 */
@@@ -931,10 -914,9 +931,10 @@@ real thole_pol(int nbonds
                 int gmx_unused *global_atom_index)
  {
      /* Interaction between two pairs of particles with opposite charge */
 -    int  i, type, a1, da1, a2, da2;
 -    real q1, q2, qq, a, al1, al2, afac;
 -    real V = 0;
 +    int        i, type, a1, da1, a2, da2;
 +    real       q1, q2, qq, a, al1, al2, afac;
 +    real       V             = 0;
 +    const real minusOneOnSix = -1.0/6.0;
  
      for (i = 0; (i < nbonds); )
      {
@@@ -949,7 -931,7 +949,7 @@@
          al1   = forceparams[type].thole.alpha1;
          al2   = forceparams[type].thole.alpha2;
          qq    = q1*q2;
 -        afac  = a*pow(al1*al2, -1.0/6.0);
 +        afac  = a*pow(al1*al2, minusOneOnSix);
          V    += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac);
          V    += do_1_thole(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac);
          V    += do_1_thole(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac);
@@@ -1734,9 -1716,10 +1734,9 @@@ do_dih_fup_noshiftf(int i, int j, int k
                      rvec m, rvec n, rvec f[])
  {
      rvec f_i, f_j, f_k, f_l;
 -    rvec uvec, vvec, svec, dx_jl;
 +    rvec uvec, vvec, svec;
      real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
      real a, b, p, q, toler;
 -    ivec jt, dt_ij, dt_kj, dt_lj;
  
      iprm  = iprod(m, m);       /*  5    */
      iprn  = iprod(n, n);       /*  5	*/
@@@ -2006,9 -1989,10 +2006,9 @@@ pdihs_noener_simd(int nbonds
      const int             nfa1 = 5;
      int                   i, iu, s;
      int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
 -    real                  ddphi;
      real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
      real                  buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
 -    real                 *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
 +    real                 *cp, *phi0, *mult, *p, *q;
      gmx_simd_real_t       phi0_S, phi_S;
      gmx_simd_real_t       mx_S, my_S, mz_S;
      gmx_simd_real_t       nx_S, ny_S, nz_S;
@@@ -2029,6 -2013,8 +2029,6 @@@
      mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
      p     = buf + 3*GMX_SIMD_REAL_WIDTH;
      q     = buf + 4*GMX_SIMD_REAL_WIDTH;
 -    sf_i  = buf + 5*GMX_SIMD_REAL_WIDTH;
 -    msf_l = buf + 6*GMX_SIMD_REAL_WIDTH;
  
      set_pbc_simd(pbc, &pbc_simd);
  
@@@ -2117,6 -2103,155 +2117,152 @@@
      }
  }
  
+ /* This is mostly a copy of pdihs_noener_simd above, but with using
+  * the RB potential instead of a harmonic potential.
+  * This function can replace rbdihs() when no energy and virial are needed.
+  */
+ static void
+ rbdihs_noener_simd(int nbonds,
+                    const t_iatom forceatoms[], const t_iparams forceparams[],
+                    const rvec x[], rvec f[],
+                    const t_pbc *pbc, const t_graph gmx_unused *g,
+                    real gmx_unused lambda,
+                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                    int gmx_unused *global_atom_index)
+ {
+     const int             nfa1 = 5;
+     int                   i, iu, s, j;
+     int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
 -    real                  ddphi;
+     real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+     real                  buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
 -    real                 *parm, *phi, *p, *q, *sf_i, *msf_l;
++    real                 *parm, *p, *q;
+ 
+     gmx_simd_real_t       phi_S;
+     gmx_simd_real_t       ddphi_S, cosfac_S;
+     gmx_simd_real_t       mx_S, my_S, mz_S;
+     gmx_simd_real_t       nx_S, ny_S, nz_S;
+     gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+     gmx_simd_real_t       parm_S, c_S;
+     gmx_simd_real_t       sin_S, cos_S;
+     gmx_simd_real_t       sf_i_S, msf_l_S;
+     pbc_simd_t            pbc_simd;
+ 
+     gmx_simd_real_t       pi_S  = gmx_simd_set1_r(M_PI);
+     gmx_simd_real_t       one_S = gmx_simd_set1_r(1.0);
+ 
+     /* Ensure SIMD register alignment */
+     dr  = gmx_simd_align_r(dr_array);
+     buf = gmx_simd_align_r(buf_array);
+ 
+     /* Extract aligned pointer for parameters and variables */
+     parm  = buf;
+     p     = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+     q     = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
 -    sf_i  = buf + (NR_RBDIHS + 2)*GMX_SIMD_REAL_WIDTH;
 -    msf_l = buf + (NR_RBDIHS + 3)*GMX_SIMD_REAL_WIDTH;
+ 
+     set_pbc_simd(pbc, &pbc_simd);
+ 
+     /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+     for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+     {
+         /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+          * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+          */
+         iu = i;
+         for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+         {
+             type  = forceatoms[iu];
+             ai[s] = forceatoms[iu+1];
+             aj[s] = forceatoms[iu+2];
+             ak[s] = forceatoms[iu+3];
+             al[s] = forceatoms[iu+4];
+ 
+             /* We don't need the first parameter, since that's a constant
+              * which only affects the energies, not the forces.
+              */
+             for (j = 1; j < NR_RBDIHS; j++)
+             {
+                 parm[j*GMX_SIMD_REAL_WIDTH + s] =
+                     forceparams[type].rbdihs.rbcA[j];
+             }
+ 
+             /* At the end fill the arrays with identical entries */
+             if (iu + nfa1 < nbonds)
+             {
+                 iu += nfa1;
+             }
+         }
+ 
+         /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+         dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                        dr,
+                        &phi_S,
+                        &mx_S, &my_S, &mz_S,
+                        &nx_S, &ny_S, &nz_S,
+                        &nrkj_m2_S,
+                        &nrkj_n2_S,
+                        p, q);
+ 
+         /* Change to polymer convention */
+         phi_S = gmx_simd_sub_r(phi_S, pi_S);
+ 
+         gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+ 
+         ddphi_S   = gmx_simd_setzero_r();
+         c_S       = one_S;
+         cosfac_S  = one_S;
+         for (j = 1; j < NR_RBDIHS; j++)
+         {
+             parm_S   = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+             ddphi_S  = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+             cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+             c_S      = gmx_simd_add_r(c_S, one_S);
+         }
+ 
+         /* Note that here we do not use the minus sign which is present
+          * in the normal RB code. This is corrected for through (m)sf below.
+          */
+         ddphi_S  = gmx_simd_mul_r(ddphi_S, sin_S);
+ 
+         sf_i_S   = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+         msf_l_S  = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+ 
+         /* After this m?_S will contain f[i] */
+         mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+         my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+         mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+ 
+         /* After this m?_S will contain -f[l] */
+         nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+         ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+         nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+ 
+         gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+         gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+         gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+         gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+         gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+         gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+ 
+         iu = i;
+         s  = 0;
+         do
+         {
+             do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                         p[s], q[s],
+                                         dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+                                         f);
+             s++;
+             iu += nfa1;
+         }
+         while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+     }
+ }
+ 
  #endif /* GMX_SIMD_HAVE_REAL */
  
  
@@@ -2175,7 -2310,7 +2321,7 @@@ real idihs(int nbonds
  
          dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
  
 -        do_dih_fup(ai, aj, ak, al, (real)(-ddphi), r_ij, r_kj, r_kl, m, n,
 +        do_dih_fup(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
                     f, fshift, pbc, g, x, t1, t2, t3); /* 112		*/
          /* 218 TOTAL	*/
  #ifdef DEBUG
@@@ -2265,44 -2400,6 +2411,44 @@@ static void posres_dx(const rvec x, con
      }
  }
  
 +/*! \brief Computes forces and potential for flat-bottom cylindrical restraints.
 + *         Returns the flat-bottom potential. */
 +static real do_fbposres_cylinder(int fbdim, rvec fm, rvec dx, real rfb, real kk, gmx_bool bInvert)
 +{
 +    int     d;
 +    real    dr, dr2, invdr, v, rfb2;
 +
 +    dr2  = 0.0;
 +    rfb2 = sqr(rfb);
 +    v    = 0.0;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        if (d != fbdim)
 +        {
 +            dr2 += sqr(dx[d]);
 +        }
 +    }
 +
 +    if  (dr2 > 0.0 &&
 +         ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
 +         )
 +    {
 +        dr     = sqrt(dr2);
 +        invdr  = 1./dr;
 +        v      = 0.5*kk*sqr(dr - rfb);
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (d != fbdim)
 +            {
 +                fm[d] = -kk*(dr-rfb)*dx[d]*invdr; /* Force pointing to the center */
 +            }
 +        }
 +    }
 +
 +    return v;
 +}
 +
  /*! \brief Adds forces of flat-bottomed positions restraints to f[]
   *         and fixes vir_diag. Returns the flat-bottomed potential. */
  real fbposres(int nbonds,
@@@ -2315,8 -2412,8 +2461,8 @@@
      int              i, ai, m, d, type, npbcdim = 0, fbdim;
      const t_iparams *pr;
      real             vtot, kk, v;
 -    real             ref = 0, dr, dr2, rpot, rfb, rfb2, fact, invdr;
 -    rvec             com_sc, rdist, pos, dx, dpdl, fm;
 +    real             dr, dr2, rfb, rfb2, fact;
 +    rvec             com_sc, rdist, dx, dpdl, fm;
      gmx_bool         bInvert;
  
      npbcdim = ePBC2npbcdim(ePBC);
@@@ -2377,22 -2474,19 +2523,22 @@@
                      svmul(fact, dx, fm);
                  }
                  break;
 +            case efbposresCYLINDERX:
 +                /* cylindrical flat-bottom posres in y-z plane. fm[XX] = 0. */
 +                fbdim = XX;
 +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
 +                break;
 +            case efbposresCYLINDERY:
 +                /* cylindrical flat-bottom posres in x-z plane. fm[YY] = 0. */
 +                fbdim = YY;
 +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
 +                break;
              case efbposresCYLINDER:
 -                /* cylidrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */
 -                dr2 = sqr(dx[XX])+sqr(dx[YY]);
 -                if  (dr2 > 0.0 &&
 -                     ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
 -                     )
 -                {
 -                    dr     = sqrt(dr2);
 -                    invdr  = 1./dr;
 -                    v      = 0.5*kk*sqr(dr - rfb);
 -                    fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr; /* Force pointing to the center */
 -                    fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr;
 -                }
 +            /* equivalent to efbposresCYLINDERZ for backwards compatibility */
 +            case efbposresCYLINDERZ:
 +                /* cylindrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */
 +                fbdim = ZZ;
 +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
                  break;
              case efbposresX: /* fbdim=XX */
              case efbposresY: /* fbdim=YY */
@@@ -2434,11 -2528,12 +2580,11 @@@ real posres(int nbonds
              real lambda, real *dvdlambda,
              int refcoord_scaling, int ePBC, rvec comA, rvec comB)
  {
 -    int              i, ai, m, d, type, ki, npbcdim = 0;
 +    int              i, ai, m, d, type, npbcdim = 0;
      const t_iparams *pr;
      real             L1;
      real             vtot, kk, fm;
 -    real             posA, posB, ref = 0;
 -    rvec             comA_sc, comB_sc, rdist, dpdl, pos, dx;
 +    rvec             comA_sc, comB_sc, rdist, dpdl, dx;
      gmx_bool         bForceValid = TRUE;
  
      if ((f == NULL) || (vir_diag == NULL))    /* should both be null together! */
@@@ -2631,7 -2726,7 +2777,7 @@@ real dihres(int nbonds
      real vtot = 0;
      int  ai, aj, ak, al, i, k, type, t1, t2, t3;
      real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
 -    real phi, ddphi, ddp, ddp2, dp, sign, d2r, fc, L1;
 +    real phi, ddphi, ddp, ddp2, dp, sign, d2r, L1;
      rvec r_ij, r_kj, r_kl, m, n;
  
      L1 = 1.0-lambda;
@@@ -2738,6 -2833,7 +2884,6 @@@ real restrangles(int nbonds
  {
      int  i, d, ai, aj, ak, type, m;
      int  t1, t2;
 -    rvec r_ij, r_kj;
      real v, vtot;
      ivec jt, dt_ij, dt_kj;
      rvec f_i, f_j, f_k;
@@@ -2865,7 -2961,7 +3011,7 @@@ real restrdihs(int nbonds
          t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
          pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
          t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
 -        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
 +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
          pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
  
          /* This function computes factors needed for restricted angle potential.
@@@ -2976,7 -3072,7 +3122,7 @@@ real cbtdihs(int nbonds
          pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
          t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
          pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
 -        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
 +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
          pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
  
          /* \brief Compute factors for CBT potential
@@@ -3151,11 -3247,7 +3297,11 @@@ real rbdihs(int nbonds
      return vtot;
  }
  
 -int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
 +//! \endcond
 +
 +/*! \brief Mysterious undocumented function */
 +static int
 +cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
  {
      int im1, ip1, ip2;
  
@@@ -3194,16 -3286,14 +3340,16 @@@
  
  }
  
 -real cmap_dihs(int nbonds,
 -               const t_iatom forceatoms[], const t_iparams forceparams[],
 -               const gmx_cmap_t *cmap_grid,
 -               const rvec x[], rvec f[], rvec fshift[],
 -               const t_pbc *pbc, const t_graph *g,
 -               real gmx_unused lambda, real gmx_unused *dvdlambda,
 -               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
 -               int  gmx_unused *global_atom_index)
 +/*! \brief Compute CMAP dihedral energies and forces */
 +static real
 +cmap_dihs(int nbonds,
 +          const t_iatom forceatoms[], const t_iparams forceparams[],
 +          const gmx_cmap_t *cmap_grid,
 +          const rvec x[], rvec f[], rvec fshift[],
 +          const t_pbc *pbc, const t_graph *g,
 +          real gmx_unused lambda, real gmx_unused *dvdlambda,
 +          const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
 +          int  gmx_unused *global_atom_index)
  {
      int         i, j, k, n, idx;
      int         ai, aj, ak, al, am;
@@@ -3212,13 -3302,13 +3358,13 @@@
      int         t11, t21, t31, t12, t22, t32;
      int         iphi1, ip1m1, ip1p1, ip1p2;
      int         iphi2, ip2m1, ip2p1, ip2p2;
 -    int         l1, l2, l3, l4;
 -    int         pos1, pos2, pos3, pos4, tmp;
 +    int         l1, l2, l3;
 +    int         pos1, pos2, pos3, pos4;
  
      real        ty[4], ty1[4], ty2[4], ty12[4], tc[16], tx[16];
 -    real        phi1, psi1, cos_phi1, sin_phi1, sign1, xphi1;
 -    real        phi2, psi2, cos_phi2, sin_phi2, sign2, xphi2;
 -    real        dx, xx, tt, tu, e, df1, df2, ddf1, ddf2, ddf12, vtot;
 +    real        phi1, cos_phi1, sin_phi1, sign1, xphi1;
 +    real        phi2, cos_phi2, sin_phi2, sign2, xphi2;
 +    real        dx, xx, tt, tu, e, df1, df2, vtot;
      real        ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
      real        ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
      real        fg1, hg1, fga1, hgb1, gaa1, gbb1;
@@@ -3280,7 -3370,7 +3426,7 @@@
          b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
          b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
  
 -        tmp = pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
 +        pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
  
          ra21  = iprod(a1, a1);       /* 5 */
          rb21  = iprod(b1, b1);       /* 5 */
@@@ -3341,7 -3431,7 +3487,7 @@@
          b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
          b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
  
 -        tmp = pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
 +        pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
  
          ra22  = iprod(a2, a2);         /* 5 */
          rb22  = iprod(b2, b2);         /* 5 */
@@@ -3406,8 -3496,8 +3552,8 @@@
          dx = 2*M_PI / cmap_grid->grid_spacing;
  
          /* Where on the grid are we */
 -        iphi1 = (int)(xphi1/dx);
 -        iphi2 = (int)(xphi2/dx);
 +        iphi1 = static_cast<int>(xphi1/dx);
 +        iphi2 = static_cast<int>(xphi2/dx);
  
          iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
          iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
@@@ -3472,6 -3562,9 +3618,6 @@@
          e     = 0;
          df1   = 0;
          df2   = 0;
 -        ddf1  = 0;
 -        ddf2  = 0;
 -        ddf12 = 0;
  
          for (i = 3; i >= 0; i--)
          {
@@@ -3482,11 -3575,19 +3628,11 @@@
              e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
              df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
              df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
 -            ddf1  = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
 -            ddf2  = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
          }
  
 -        ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
 -            3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
 -
          fac     = RAD2DEG/dx;
          df1     = df1   * fac;
          df2     = df2   * fac;
 -        ddf1    = ddf1  * fac * fac;
 -        ddf2    = ddf2  * fac * fac;
 -        ddf12   = ddf12 * fac * fac;
  
          /* CMAP energy */
          vtot += e;
@@@ -3593,7 -3694,7 +3739,7 @@@
  }
  
  
 -
 +//! \cond
  /***********************************************************
   *
   *   G R O M O S  9 6   F U N C T I O N S
@@@ -3852,7 -3953,7 +3998,7 @@@ real cross_bond_angle(int nbonds
      /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
       * pp. 842-847
       */
 -    int  i, ai, aj, ak, type, m, t1, t2, t3;
 +    int  i, ai, aj, ak, type, m, t1, t2;
      rvec r_ij, r_kj, r_ik;
      real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
      rvec f_i, f_j, f_k;
@@@ -3873,7 -3974,7 +4019,7 @@@
          /* Compute distance vectors ... */
          t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
          t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
 -        t3 = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
 +        pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
  
          /* ... and their lengths */
          r1 = norm(r_ij);
@@@ -3931,7 -4032,7 +4077,7 @@@ static real bonded_tab(const char *type
  {
      real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF;
      int  n0, nnn;
 -    real v, f, dvdlambda;
 +    real dvdlambda;
  
      k = (1.0 - lambda)*kA + lambda*kB;
  
@@@ -3939,7 -4040,7 +4085,7 @@@
      VFtab    = table->data;
  
      rt    = r*tabscale;
 -    n0    = rt;
 +    n0    = static_cast<int>(rt);
      if (n0 >= table->n)
      {
          gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
@@@ -4066,7 -4167,7 +4212,7 @@@ real tab_angles(int nbonds
          if (cos_theta2 < 1)
          {
              int  m;
 -            real snt, st, sth;
 +            real st, sth;
              real cik, cii, ckk;
              real nrkj2, nrij2;
              rvec f_i, f_j, f_k;
@@@ -4160,10 -4261,11 +4306,10 @@@ real tab_dihs(int nbonds
      return vtot;
  }
  
 -/* Return if this is a potential calculated in bondfree.c,
 - * i.e. an interaction that actually calculates a potential and
 - * works on multiple atoms (not e.g. a connection or a position restraint).
 - */
 -static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype)
 +//! \endcond
 +
 +gmx_bool
 +ftype_is_bonded_potential(int ftype)
  {
      return
          (interaction_function[ftype].flags & IF_BOND) &&
@@@ -4171,7 -4273,173 +4317,7 @@@
          (ftype < F_GB12 || ftype > F_GB14);
  }
  
 -static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
 -{
 -    int ftype;
 -    int nat1;
 -    int t;
 -    int il_nr_thread;
 -
 -    idef->nthreads = nthreads;
 -
 -    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
 -    {
 -        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
 -        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
 -    }
 -
 -    for (ftype = 0; ftype < F_NRE; ftype++)
 -    {
 -        if (ftype_is_bonded_potential(ftype))
 -        {
 -            nat1 = interaction_function[ftype].nratoms + 1;
 -
 -            for (t = 0; t <= nthreads; t++)
 -            {
 -                /* Divide the interactions equally over the threads.
 -                 * When the different types of bonded interactions
 -                 * are distributed roughly equally over the threads,
 -                 * this should lead to well localized output into
 -                 * the force buffer on each thread.
 -                 * If this is not the case, a more advanced scheme
 -                 * (not implemented yet) will do better.
 -                 */
 -                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
 -
 -                /* Ensure that distance restraint pairs with the same label
 -                 * end up on the same thread.
 -                 * This is slighlty tricky code, since the next for iteration
 -                 * may have an initial il_nr_thread lower than the final value
 -                 * in the previous iteration, but this will anyhow be increased
 -                 * to the approriate value again by this while loop.
 -                 */
 -                while (ftype == F_DISRES &&
 -                       il_nr_thread > 0 &&
 -                       il_nr_thread < idef->il[ftype].nr &&
 -                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
 -                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
 -                {
 -                    il_nr_thread += nat1;
 -                }
 -
 -                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
 -            }
 -        }
 -    }
 -}
 -
 -static unsigned
 -calc_bonded_reduction_mask(const t_idef *idef,
 -                           int shift,
 -                           int t, int nt)
 -{
 -    unsigned mask;
 -    int      ftype, nb, nat1, nb0, nb1, i, a;
 -
 -    mask = 0;
 -
 -    for (ftype = 0; ftype < F_NRE; ftype++)
 -    {
 -        if (ftype_is_bonded_potential(ftype))
 -        {
 -            nb = idef->il[ftype].nr;
 -            if (nb > 0)
 -            {
 -                nat1 = interaction_function[ftype].nratoms + 1;
 -
 -                /* Divide this interaction equally over the threads.
 -                 * This is not stored: should match division in calc_bonds.
 -                 */
 -                nb0 = idef->il_thread_division[ftype*(nt+1)+t];
 -                nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
 -
 -                for (i = nb0; i < nb1; i += nat1)
 -                {
 -                    for (a = 1; a < nat1; a++)
 -                    {
 -                        mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
 -                    }
 -                }
 -            }
 -        }
 -    }
 -
 -    return mask;
 -}
 -
 -void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
 -{
 -#define MAX_BLOCK_BITS 32
 -    int t;
 -    int ctot, c, b;
 -
 -    assert(fr->nthreads >= 1);
 -
 -    /* Divide the bonded interaction over the threads */
 -    divide_bondeds_over_threads(idef, fr->nthreads);
 -
 -    if (fr->nthreads == 1)
 -    {
 -        fr->red_nblock = 0;
 -
 -        return;
 -    }
 -
 -    /* We divide the force array in a maximum of 32 blocks.
 -     * Minimum force block reduction size is 2^6=64.
 -     */
 -    fr->red_ashift = 6;
 -    while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
 -    {
 -        fr->red_ashift++;
 -    }
 -    if (debug)
 -    {
 -        fprintf(debug, "bonded force buffer block atom shift %d bits\n",
 -                fr->red_ashift);
 -    }
 -
 -    /* Determine to which blocks each thread's bonded force calculation
 -     * contributes. Store this is a mask for each thread.
 -     */
 -#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 -    for (t = 1; t < fr->nthreads; t++)
 -    {
 -        fr->f_t[t].red_mask =
 -            calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads);
 -    }
 -
 -    /* Determine the maximum number of blocks we need to reduce over */
 -    fr->red_nblock = 0;
 -    ctot           = 0;
 -    for (t = 0; t < fr->nthreads; t++)
 -    {
 -        c = 0;
 -        for (b = 0; b < MAX_BLOCK_BITS; b++)
 -        {
 -            if (fr->f_t[t].red_mask & (1U<<b))
 -            {
 -                fr->red_nblock = max(fr->red_nblock, b+1);
 -                c++;
 -            }
 -        }
 -        if (debug)
 -        {
 -            fprintf(debug, "thread %d flags %x count %d\n",
 -                    t, fr->f_t[t].red_mask, c);
 -        }
 -        ctot += c;
 -    }
 -    if (debug)
 -    {
 -        fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
 -                fr->red_nblock, 1<<fr->red_ashift);
 -        fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
 -                ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
 -                ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
 -    }
 -}
 -
 +/*! \brief Zero thread-local force-output buffers */
  static void zero_thread_forces(f_thread_t *f_t, int n,
                                 int nblock, int blocksize)
  {
@@@ -4190,7 -4458,7 +4336,7 @@@
              if (f_t->red_mask && (1U<<b))
              {
                  a0 = b*blocksize;
 -                a1 = min((b+1)*blocksize, n);
 +                a1 = std::min((b+1)*blocksize, n);
                  for (a = a0; a < a1; a++)
                  {
                      clear_rvec(f_t->f[a]);
@@@ -4219,16 -4487,15 +4365,16 @@@
      }
  }
  
 +/*! \brief The max thread number is arbitrary, we used a fixed number
 + * to avoid memory management.  Using more than 16 threads is probably
 + * never useful performance wise. */
 +#define MAX_BONDED_THREADS 256
 +
 +/*! \brief Reduce thread-local force buffers */
  static void reduce_thread_force_buffer(int n, rvec *f,
                                         int nthreads, f_thread_t *f_t,
                                         int nblock, int block_size)
  {
 -    /* The max thread number is arbitrary,
 -     * we used a fixed number to avoid memory management.
 -     * Using more than 16 threads is probably never useful performance wise.
 -     */
 -#define MAX_BONDED_THREADS 256
      int b;
  
      if (nthreads > MAX_BONDED_THREADS)
@@@ -4261,7 -4528,7 +4407,7 @@@
              /* Reduce force buffers for threads that contribute */
              a0 =  b   *block_size;
              a1 = (b+1)*block_size;
 -            a1 = min(a1, n);
 +            a1 = std::min(a1, n);
              for (a = a0; a < a1; a++)
              {
                  for (fb = 0; fb < nfb; fb++)
@@@ -4273,7 -4540,6 +4419,7 @@@
      }
  }
  
 +/*! \brief Reduce thread-local forces */
  static void reduce_thread_forces(int n, rvec *f, rvec *fshift,
                                   real *ener, gmx_grppairener_t *grpp, real *dvdl,
                                   int nthreads, f_thread_t *f_t,
@@@ -4331,11 -4597,9 +4477,11 @@@
      }
  }
  
 -static real calc_one_bond(FILE *fplog, int thread,
 +/*! \brief Calculate one element of the list of bonded interactions
 +    for this thread */
 +static real calc_one_bond(int thread,
                            int ftype, const t_idef *idef,
 -                          rvec x[], rvec f[], rvec fshift[],
 +                          const rvec x[], rvec f[], rvec fshift[],
                            t_forcerec *fr,
                            const t_pbc *pbc, const t_graph *g,
                            gmx_grppairener_t *grpp,
@@@ -4343,7 -4607,7 +4489,7 @@@
                            real *lambda, real *dvdl,
                            const t_mdatoms *md, t_fcdata *fcd,
                            gmx_bool bCalcEnerVir,
 -                          int *global_atom_index, gmx_bool bPrintSepPot)
 +                          int *global_atom_index)
  {
      int      nat1, nbonds, efptFTYPE;
      real     v = 0;
@@@ -4372,7 -4636,7 +4518,7 @@@
          {
              v = cmap_dihs(nbn, iatoms+nb0,
                            idef->iparams, &idef->cmap_grid,
 -                          (const rvec*)x, f, fshift,
 +                          x, f, fshift,
                            pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
                            md, fcd, global_atom_index);
          }
@@@ -4383,7 -4647,7 +4529,7 @@@
              /* No energies, shift forces, dvdl */
              angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
                                 idef->iparams,
 -                               (const rvec*)x, f,
 +                               x, f,
                                 pbc, g, lambda[efptFTYPE], md, fcd,
                                 global_atom_index);
              v = 0;
@@@ -4400,24 -4664,53 +4546,37 @@@
  #endif
                  (nbn, idef->il[ftype].iatoms+nb0,
                  idef->iparams,
 -                (const rvec*)x, f,
 +                x, f,
                  pbc, g, lambda[efptFTYPE], md, fcd,
                  global_atom_index);
              v = 0;
          }
+ #ifdef GMX_SIMD_HAVE_REAL
+         else if (ftype == F_RBDIHS &&
+                  !bCalcEnerVir && fr->efep == efepNO)
+         {
+             /* No energies, shift forces, dvdl */
+             rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                                idef->iparams,
+                                (const rvec*)x, f,
+                                pbc, g, lambda[efptFTYPE], md, fcd,
+                                global_atom_index);
+             v = 0;
+         }
+ #endif
          else
          {
              v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
                                                    idef->iparams,
 -                                                  (const rvec*)x, f, fshift,
 +                                                  x, f, fshift,
                                                    pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
                                                    md, fcd, global_atom_index);
          }
 -        if (bPrintSepPot)
 -        {
 -            fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
 -                    interaction_function[ftype].longname,
 -                    nbonds, v, lambda[efptFTYPE]);
 -        }
      }
      else
      {
 -        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
 +        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, x, f, fshift,
                                  pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
 -
 -        if (bPrintSepPot)
 -        {
 -            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
 -                    interaction_function[ftype].longname,
 -                    interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]);
 -            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
 -                    interaction_function[ftype].longname,
 -                    interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]);
 -        }
      }
  
      if (thread == 0)
@@@ -4428,22 -4721,25 +4587,22 @@@
      return v;
  }
  
 -void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
 +void calc_bonds(const gmx_multisim_t *ms,
                  const t_idef *idef,
 -                rvec x[], history_t *hist,
 +                const rvec x[], history_t *hist,
                  rvec f[], t_forcerec *fr,
 -                const t_pbc *pbc, const t_graph *g,
 +                const struct t_pbc *pbc, const struct t_graph *g,
                  gmx_enerdata_t *enerd, t_nrnb *nrnb,
                  real *lambda,
                  const t_mdatoms *md,
                  t_fcdata *fcd, int *global_atom_index,
 -                t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
 -                int force_flags,
 -                gmx_bool bPrintSepPot, gmx_int64_t step)
 +                int force_flags)
  {
      gmx_bool      bCalcEnerVir;
      int           i;
 -    real          v, dvdl[efptNR], dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
 +    real          dvdl[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
                                                          of lambda, which will be thrown away in the end*/
      const  t_pbc *pbc_null;
 -    char          buf[22];
      int           thread;
  
      assert(fr->nthreads == idef->nthreads);
@@@ -4462,6 -4758,11 +4621,6 @@@
      {
          pbc_null = NULL;
      }
 -    if (bPrintSepPot)
 -    {
 -        fprintf(fplog, "Step %s: bonded V and dVdl for this rank\n",
 -                gmx_step_str(step, buf));
 -    }
  
  #ifdef DEBUG
      if (g && debug)
@@@ -4476,14 -4777,14 +4635,14 @@@
          enerd->term[F_ORIRESDEV] =
              calc_orires_dev(ms, idef->il[F_ORIRES].nr,
                              idef->il[F_ORIRES].iatoms,
 -                            idef->iparams, md, (const rvec*)x,
 +                            idef->iparams, md, x,
                              pbc_null, fcd, hist);
      }
      if (idef->il[F_DISRES].nr)
      {
          calc_disres_R_6(idef->il[F_DISRES].nr,
                          idef->il[F_DISRES].iatoms,
 -                        idef->iparams, (const rvec*)x, pbc_null,
 +                        idef->iparams, x, pbc_null,
                          fcd, hist);
  #ifdef GMX_MPI
          if (fcd->disres.nsystems > 1)
@@@ -4527,11 -4828,11 +4686,11 @@@
          {
              if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
              {
 -                v = calc_one_bond(fplog, thread, ftype, idef, x,
 +                v = calc_one_bond(thread, ftype, idef, x,
                                    ft, fshift, fr, pbc_null, g, grpp,
                                    nrnb, lambda, dvdlt,
                                    md, fcd, bCalcEnerVir,
 -                                  global_atom_index, bPrintSepPot);
 +                                  global_atom_index);
                  epot[ftype] += v;
              }
          }
@@@ -4561,17 -4862,18 +4720,17 @@@
      }
  }
  
 -void calc_bonds_lambda(FILE *fplog,
 -                       const t_idef *idef,
 -                       rvec x[],
 +void calc_bonds_lambda(const t_idef *idef,
 +                       const rvec x[],
                         t_forcerec *fr,
 -                       const t_pbc *pbc, const t_graph *g,
 +                       const struct t_pbc *pbc, const struct t_graph *g,
                         gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb,
                         real *lambda,
                         const t_mdatoms *md,
                         t_fcdata *fcd,
                         int *global_atom_index)
  {
 -    int           i, ftype, nr_nonperturbed, nr;
 +    int           ftype, nr_nonperturbed, nr;
      real          v;
      real          dvdl_dum[efptNR];
      rvec         *f, *fshift;
@@@ -4612,11 -4914,11 +4771,11 @@@
  
              if (nr - nr_nonperturbed > 0)
              {
 -                v = calc_one_bond(fplog, 0, ftype, &idef_fe,
 +                v = calc_one_bond(0, ftype, &idef_fe,
                                    x, f, fshift, fr, pbc_null, g,
                                    grpp, nrnb, lambda, dvdl_dum,
                                    md, fcd, TRUE,
 -                                  global_atom_index, FALSE);
 +                                  global_atom_index);
                  epot[ftype] += v;
              }
          }
diff --combined src/gromacs/mdlib/domdec.cpp
index 13934d5fa6,6cc6b73895..06f2602916
--- a/src/gromacs/mdlib/domdec.cpp
+++ b/src/gromacs/mdlib/domdec.cpp
@@@ -33,59 -33,53 +33,59 @@@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include <stdio.h>
 -#include <time.h>
 +#include "gromacs/legacyheaders/domdec.h"
 +
 +#include "config.h"
 +
 +#include <assert.h>
  #include <math.h>
 -#include <string.h>
 +#include <stdio.h>
  #include <stdlib.h>
 -#include <assert.h>
 +#include <string.h>
 +#include <time.h>
 +
 +#include <algorithm>
  
 -#include "typedefs.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "gmx_fatal.h"
 -#include "gmx_fatal_collective.h"
 -#include "vec.h"
 -#include "domdec.h"
 -#include "domdec_network.h"
 -#include "nrnb.h"
 -#include "pbc.h"
 -#include "chargegroup.h"
 -#include "constr.h"
 -#include "mdatoms.h"
 -#include "names.h"
 -#include "force.h"
 -#include "pme.h"
 -#include "mdrun.h"
 -#include "nsgrid.h"
 -#include "shellfc.h"
 -#include "mtop_util.h"
 -#include "gmx_ga2la.h"
 -#include "macros.h"
 -#include "nbnxn_search.h"
 -#include "bondf.h"
 -#include "gmx_omp_nthreads.h"
 -#include "gpu_utils.h"
 -
 -#include "gromacs/fileio/futil.h"
  #include "gromacs/fileio/gmxfio.h"
  #include "gromacs/fileio/pdbio.h"
 +#include "gromacs/imd/imd.h"
 +#include "gromacs/legacyheaders/bonded-threading.h"
 +#include "gromacs/legacyheaders/chargegroup.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/domdec_network.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/gmx_ga2la.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/gpu_utils.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/nsgrid.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/shellfc.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/listed-forces/bonded.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_search.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/pulling/pull.h"
 +#include "gromacs/pulling/pull_rotation.h"
 +#include "gromacs/swap/swapcoords.h"
  #include "gromacs/timing/wallcycle.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/utility/basenetwork.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/futil.h"
  #include "gromacs/utility/gmxmpi.h"
 -#include "gromacs/swap/swapcoords.h"
  #include "gromacs/utility/qsort_threadsafe.h"
 -#include "gromacs/pulling/pull.h"
 -#include "gromacs/pulling/pull_rotation.h"
 -#include "gromacs/imd/imd.h"
 +#include "gromacs/utility/smalloc.h"
  
  #define DDRANK(dd, rank)    (rank)
  #define DDMASTERRANK(dd)   (dd->masterrank)
@@@ -275,6 -269,8 +275,8 @@@ typedef struct gmx_domdec_com
  
      /* The DLB option */
      int      eDLB;
+     /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+     gmx_bool bDLB_locked;
      /* Are we actually using DLB? */
      gmx_bool bDynLoadBal;
  
@@@ -391,9 -387,9 +393,9 @@@
      int    eFlop;
      double flop;
      int    flop_n;
-     /* Have often have did we have load measurements */
+     /* How many times have did we have load measurements */
      int    n_load_have;
-     /* Have often have we collected the load measurements */
+     /* How many times have we collected the load measurements */
      int    n_load_collect;
  
      /* Statistics */
@@@ -772,7 -768,7 +774,7 @@@ void dd_move_f(gmx_domdec_t *dd, rvec f
      rvec                  *buf, *sbuf;
      ivec                   vis;
      int                    is;
 -    gmx_bool               bPBC, bScrew;
 +    gmx_bool               bShiftForcesNeedPbc, bScrew;
  
      comm = dd->comm;
  
@@@ -780,17 -776,16 +782,17 @@@
  
      buf = comm->vbuf.v;
  
 -    n       = 0;
      nzone   = comm->zones.n/2;
      nat_tot = dd->nat_tot;
      for (d = dd->ndim-1; d >= 0; d--)
      {
 -        bPBC   = (dd->ci[dd->dim[d]] == 0);
 -        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        /* Only forces in domains near the PBC boundaries need to
 +           consider PBC in the treatment of fshift */
 +        bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
          if (fshift == NULL && !bScrew)
          {
 -            bPBC = FALSE;
 +            bShiftForcesNeedPbc = FALSE;
          }
          /* Determine which shift vector we need */
          clear_ivec(vis);
@@@ -826,7 -821,7 +828,7 @@@
              index = ind->index;
              /* Add the received forces */
              n = 0;
 -            if (!bPBC)
 +            if (!bShiftForcesNeedPbc)
              {
                  for (i = 0; i < ind->nsend[nzone]; i++)
                  {
@@@ -841,9 -836,6 +843,9 @@@
              }
              else if (!bScrew)
              {
 +                /* fshift should always be defined if this function is
 +                 * called when bShiftForcesNeedPbc is true */
 +                assert(NULL != fshift);
                  for (i = 0; i < ind->nsend[nzone]; i++)
                  {
                      at0 = cgindex[index[i]];
@@@ -964,6 -956,7 +966,6 @@@ void dd_atom_sum_real(gmx_domdec_t *dd
  
      buf = &comm->vbuf.v[0][0];
  
 -    n       = 0;
      nzone   = comm->zones.n/2;
      nat_tot = dd->nat_tot;
      for (d = dd->ndim-1; d >= 0; d--)
@@@ -1069,7 -1062,7 +1071,7 @@@ static void dd_sendrecv_ddzone(const gm
  static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                            rvec cell_ns_x0, rvec cell_ns_x1)
  {
 -    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
 +    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
      gmx_ddzone_t      *zp;
      gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
      gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
@@@ -1141,7 -1134,7 +1143,7 @@@
          if (bPBC)
          {
              /* Take the minimum to avoid double communication */
 -            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +            npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
          }
          else
          {
@@@ -1165,9 -1158,9 +1167,9 @@@
              {
                  for (d1 = d; d1 < dd->ndim-1; d1++)
                  {
 -                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 -                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 -                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                    extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
                  }
              }
          }
@@@ -1228,9 -1221,9 +1230,9 @@@
                  {
                      if (bUse)
                      {
 -                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 -                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 -                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                        buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                      }
  
                      if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
@@@ -1243,8 -1236,8 +1245,8 @@@
                      }
                      if (bUse && dh[d1] >= 0)
                      {
 -                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 -                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                        buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                      }
                  }
                  /* Copy the received buffer to the send buffer,
@@@ -1260,9 -1253,9 +1262,9 @@@
  
                  for (d1 = d; d1 < dd->ndim-1; d1++)
                  {
 -                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 -                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 -                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                      pos++;
                  }
  
@@@ -1292,8 -1285,8 +1294,8 @@@
              {
                  print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
              }
 -            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 -            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +            cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
          }
      }
      if (dd->ndim >= 3)
@@@ -1307,8 -1300,8 +1309,8 @@@
                  {
                      print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                  }
 -                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 -                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +                cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
              }
          }
      }
@@@ -1534,6 -1527,10 +1536,6 @@@ static void dd_collect_vec_gatherv(gmx_
  void dd_collect_vec(gmx_domdec_t *dd,
                      t_state *state_local, rvec *lv, rvec *v)
  {
 -    gmx_domdec_master_t *ma;
 -    int                  n, i, c, a, nalloc = 0;
 -    rvec                *buf = NULL;
 -
      dd_collect_cg(dd, state_local);
  
      if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
@@@ -1755,7 -1752,7 +1757,7 @@@ static void dd_distribute_vec_scatterv(
  {
      gmx_domdec_master_t *ma;
      int                 *scounts = NULL, *disps = NULL;
 -    int                  n, i, c, a, nalloc = 0;
 +    int                  n, i, c, a;
      rvec                *buf = NULL;
  
      if (DDMASTER(dd))
@@@ -1949,7 -1946,7 +1951,7 @@@ static void write_dd_grid_pdb(const cha
          snew(grid_r, 2*dd->nnodes);
      }
  
 -    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
  
      if (DDMASTER(dd))
      {
@@@ -2092,15 -2089,15 +2094,15 @@@ real dd_cutoff_mbody(gmx_domdec_t *dd
              r = comm->cellsize_min[dd->dim[0]];
              for (di = 1; di < dd->ndim; di++)
              {
 -                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
              }
              if (comm->bBondComm)
              {
 -                r = max(r, comm->cutoff_mbody);
 +                r = std::max(r, comm->cutoff_mbody);
              }
              else
              {
 -                r = min(r, comm->cutoff);
 +                r = std::min(r, comm->cutoff);
              }
          }
      }
@@@ -2114,7 -2111,7 +2116,7 @@@ real dd_cutoff_twobody(gmx_domdec_t *dd
  
      r_mb = dd_cutoff_mbody(dd);
  
 -    return max(dd->comm->cutoff, r_mb);
 +    return std::max(dd->comm->cutoff, r_mb);
  }
  
  
@@@ -2176,7 -2173,7 +2178,7 @@@ static int *dd_pmenodes(t_commrec *cr
  static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
  {
      gmx_domdec_t *dd;
 -    ivec          coords, coords_pme, nc;
 +    ivec          coords;
      int           slab;
  
      dd = cr->dd;
@@@ -2245,6 -2242,7 +2247,6 @@@ static int dd_simnode2pmenode(t_commre
  {
      gmx_domdec_t      *dd;
      gmx_domdec_comm_t *comm;
 -    ivec               coord, coord_pme;
      int                i;
      int                pmenode = -1;
  
@@@ -2255,7 -2253,6 +2257,7 @@@
      if (comm->bCartesianPP_PME)
      {
  #ifdef GMX_MPI
 +        ivec coord, coord_pme;
          MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
          if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
          {
@@@ -2393,7 -2390,7 +2395,7 @@@ void get_pme_ddnodes(t_commrec *cr, in
  static gmx_bool receive_vir_ener(t_commrec *cr)
  {
      gmx_domdec_comm_t *comm;
 -    int                pmenode, coords[DIM], rank;
 +    int                pmenode;
      gmx_bool           bReceive;
  
      bReceive = TRUE;
@@@ -2404,12 -2401,10 +2406,12 @@@
          {
              pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
  #ifdef GMX_MPI
 +            ivec coords;
              MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
              coords[comm->cartpmedim]++;
              if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
              {
 +                int rank;
                  MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                  if (dd_simnode2pmenode(cr, rank) == pmenode)
                  {
@@@ -2517,8 -2512,12 +2519,8 @@@ static void make_dd_indices(gmx_domdec_
  {
      int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
      int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 -    gmx_ga2la_t *ga2la;
 -    char        *bLocalCG;
      gmx_bool     bCGs;
  
 -    bLocalCG = dd->comm->bLocalCG;
 -
      if (dd->nat_tot > dd->gatindex_nalloc)
      {
          dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
@@@ -2583,7 -2582,7 +2585,7 @@@
  static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                            const char *where)
  {
 -    int ncg, i, ngl, nerr;
 +    int i, ngl, nerr;
  
      nerr = 0;
      if (bLocalCG == NULL)
@@@ -2743,12 -2742,12 +2745,12 @@@ static real cellsize_min_dlb(gmx_domdec
          /* The cut-off might have changed, e.g. by PME load balacning,
           * from the value used to set comm->cellsize_min, so check it.
           */
 -        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
 +        cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
  
          if (comm->bPMELoadBalDLBLimits)
          {
              /* Check for the cut-off limit set by the PME load balancing */
 -            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
 +            cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
          }
      }
  
@@@ -2771,10 -2770,10 +2773,10 @@@ static real grid_jump_limit(gmx_domdec_
      {
          if (comm->bPMELoadBalDLBLimits)
          {
 -            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +            cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
          }
 -        grid_jump_limit = max(grid_jump_limit,
 -                              cutoff/comm->cd[dim_ind].np);
 +        grid_jump_limit = std::max(grid_jump_limit,
 +                                   cutoff/comm->cd[dim_ind].np);
      }
  
      return grid_jump_limit;
@@@ -2961,8 -2960,8 +2963,8 @@@ static void init_ddpme(gmx_domdec_t *dd
              {
                  slab = pmeindex % ddpme->nslab;
              }
 -            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 -            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
          }
      }
  
@@@ -3163,7 -3162,7 +3165,7 @@@ static void set_dd_cell_sizes_slb(gmx_d
                  {
                      npulse[d]++;
                  }
 -                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +                cellsize_min[d] = std::min(cellsize_min[d], cellsize);
              }
              if (setmode == setcellsizeslbLOCAL)
              {
@@@ -3433,7 -3432,7 +3435,7 @@@ static void set_dd_cell_sizes_dlb_root(
                                         gmx_bool bUniform, gmx_int64_t step)
  {
      gmx_domdec_comm_t *comm;
 -    int                ncd, d1, i, j, pos;
 +    int                ncd, d1, i, pos;
      real              *cell_size;
      real               load_aver, load_i, imbalance, change, change_max, sc;
      real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
@@@ -3465,7 -3464,7 +3467,7 @@@
              cell_size[i] = 1.0/ncd;
          }
      }
-     else if (dd_load_count(comm))
+     else if (dd_load_count(comm) > 0)
      {
          load_aver  = comm->load[d].sum_m/ncd;
          change_max = 0;
@@@ -3476,7 -3475,7 +3478,7 @@@
              imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
              /* Determine the change of the cell size using underrelaxation */
              change     = -relax*imbalance;
 -            change_max = max(change_max, max(change, -change));
 +            change_max = std::max(change_max, std::max(change, -change));
          }
          /* Limit the amount of scaling.
           * We need to use the same rescaling for all cells in one row,
@@@ -3619,7 -3618,7 +3621,7 @@@ static void distribute_dd_cell_sizes_dl
                                           gmx_ddbox_t *ddbox)
  {
      gmx_domdec_comm_t *comm;
 -    int                d1, dim1, pos;
 +    int                d1, pos;
  
      comm = dd->comm;
  
@@@ -3907,13 -3906,13 +3909,13 @@@ static void distribute_cg(FILE *fplog, 
  {
      gmx_domdec_master_t *ma;
      int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 -    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    int                  i, icg, j, k, k0, k1, d;
      matrix               tcm;
 -    rvec                 box_size, cg_cm;
 +    rvec                 cg_cm;
      ivec                 ind;
      real                 nrcg, inv_ncg, pos_d;
      atom_id             *cgindex;
 -    gmx_bool             bUnbounded, bScrew;
 +    gmx_bool             bScrew;
  
      ma = dd->ma;
  
@@@ -4121,9 -4120,9 +4123,9 @@@ static void get_cg_distribution(FILE *f
      }
  
      dd_scatterv(dd,
 -                DDMASTER(dd) ? ma->ibuf : NULL,
 -                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 -                DDMASTER(dd) ? ma->cg : NULL,
 +                bMaster ? ma->ibuf : NULL,
 +                bMaster ? ma->ibuf+dd->nnodes : NULL,
 +                bMaster ? ma->cg : NULL,
                  dd->ncg_home*sizeof(int), dd->index_gl);
  
      /* Determine the home charge group sizes */
@@@ -4336,7 -4335,7 +4338,7 @@@ static void clear_and_mark_ind(int ncg
  static void print_cg_move(FILE *fplog,
                            gmx_domdec_t *dd,
                            gmx_int64_t step, int cg, int dim, int dir,
-                           gmx_bool bHaveLimitdAndCMOld, real limitd,
+                           gmx_bool bHaveCgcmOld, real limitd,
                            rvec cm_old, rvec cm_new, real pos_d)
  {
      gmx_domdec_comm_t *comm;
@@@ -4345,19 -4344,22 +4347,22 @@@
      comm = dd->comm;
  
      fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
-     if (bHaveLimitdAndCMOld)
+     if (limitd > 0)
      {
-         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                  ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
      }
      else
      {
-         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
+         /* We don't have a limiting distance available: don't print it */
+         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                  ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
      }
      fprintf(fplog, "distance out of cell %f\n",
              dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
-     if (bHaveLimitdAndCMOld)
+     if (bHaveCgcmOld)
      {
          fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
                  cm_old[XX], cm_old[YY], cm_old[ZZ]);
@@@ -4375,19 -4377,20 +4380,20 @@@
  static void cg_move_error(FILE *fplog,
                            gmx_domdec_t *dd,
                            gmx_int64_t step, int cg, int dim, int dir,
-                           gmx_bool bHaveLimitdAndCMOld, real limitd,
+                           gmx_bool bHaveCgcmOld, real limitd,
                            rvec cm_old, rvec cm_new, real pos_d)
  {
      if (fplog)
      {
          print_cg_move(fplog, dd, step, cg, dim, dir,
-                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
      }
      print_cg_move(stderr, dd, step, cg, dim, dir,
-                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
      gmx_fatal(FARGS,
-               "A charge group moved too far between two domain decomposition steps\n"
-               "This usually means that your system is not well equilibrated");
+               "%s moved too far between two domain decomposition steps\n"
+               "This usually means that your system is not well equilibrated",
+               dd->comm->bCGs ? "A charge group" : "An atom");
  }
  
  static void rotate_state_atom(t_state *state, int a)
@@@ -4454,8 -4457,8 +4460,8 @@@ static void calc_cg_move(FILE *fplog, g
                           int *move)
  {
      int      npbcdim;
 -    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 -    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      cg, k, k0, k1, d, dim, d2;
 +    int      mc, nrcg;
      int      flag;
      gmx_bool bScrew;
      ivec     dev;
@@@ -4509,7 -4512,8 +4515,8 @@@
                  {
                      if (pos_d >= limit1[d])
                      {
-                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+                         cg_move_error(fplog, dd, step, cg, d, 1,
+                                       cg_cm != state->x, limitd[d],
                                        cg_cm[cg], cm_new, pos_d);
                      }
                      dev[d] = 1;
@@@ -4535,7 -4539,8 +4542,8 @@@
                  {
                      if (pos_d < limit0[d])
                      {
-                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+                         cg_move_error(fplog, dd, step, cg, d, -1,
+                                       cg_cm != state->x, limitd[d],
                                        cg_cm[cg], cm_new, pos_d);
                      }
                      dev[d] = -1;
@@@ -4629,15 -4634,17 +4637,15 @@@ static void dd_redistribute_cg(FILE *fp
      int               *move;
      int                npbcdim;
      int                ncg[DIM*2], nat[DIM*2];
 -    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 -    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
 +    int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
      int                sbuf[2], rbuf[2];
      int                home_pos_cg, home_pos_at, buf_pos;
      int                flag;
      gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 -    gmx_bool           bScrew;
 -    ivec               dev;
 -    real               inv_ncg, pos_d;
 +    real               pos_d;
      matrix             tcm;
 -    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
      atom_id           *cgindex;
      cginfo_mb_t       *cginfo_mb;
      gmx_domdec_comm_t *comm;
@@@ -4894,6 -4901,7 +4902,6 @@@
      {
          dim      = dd->dim[d];
          ncg_recv = 0;
 -        nat_recv = 0;
          nvr      = 0;
          for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
          {
@@@ -4928,6 -4936,7 +4936,6 @@@
                               comm->cgcm_state[cdd], nvs,
                               comm->vbuf.v+nvr, i);
              ncg_recv += rbuf[0];
 -            nat_recv += rbuf[1];
              nvr      += i;
          }
  
@@@ -4949,7 -4958,7 +4957,7 @@@
                  {
                      cg_move_error(fplog, dd, step, cg, dim,
                                    (flag & DD_FLAG_FW(d)) ? 1 : 0,
-                                   FALSE, 0,
+                                   fr->cutoff_scheme == ecutsGROUP, 0,
                                    comm->vbuf.v[buf_pos],
                                    comm->vbuf.v[buf_pos],
                                    comm->vbuf.v[buf_pos][dim]);
@@@ -5231,7 -5240,7 +5239,7 @@@ static void get_load_distribution(gmx_d
      gmx_domdec_comm_t *comm;
      gmx_domdec_load_t *load;
      gmx_domdec_root_t *root = NULL;
 -    int                d, dim, cid, i, pos;
 +    int                d, dim, i, pos;
      float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
      gmx_bool           bSepPME;
  
@@@ -5327,7 -5336,7 +5335,7 @@@
                  for (i = 0; i < dd->nc[dim]; i++)
                  {
                      load->sum += load->load[pos++];
 -                    load->max  = max(load->max, load->load[pos]);
 +                    load->max  = std::max(load->max, load->load[pos]);
                      pos++;
                      if (dd->bGridJump)
                      {
@@@ -5336,14 -5345,14 +5344,14 @@@
                              /* This direction could not be load balanced properly,
                               * therefore we need to use the maximum iso the average load.
                               */
 -                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                            load->sum_m = std::max(load->sum_m, load->load[pos]);
                          }
                          else
                          {
                              load->sum_m += load->load[pos];
                          }
                          pos++;
 -                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        load->cvol_min = std::min(load->cvol_min, load->load[pos]);
                          pos++;
                          if (d < dd->ndim-1)
                          {
@@@ -5357,9 -5366,9 +5365,9 @@@
                      }
                      if (bSepPME)
                      {
 -                        load->mdf = max(load->mdf, load->load[pos]);
 +                        load->mdf = std::max(load->mdf, load->load[pos]);
                          pos++;
 -                        load->pme = max(load->pme, load->load[pos]);
 +                        load->pme = std::max(load->pme, load->load[pos]);
                          pos++;
                      }
                  }
@@@ -5753,6 -5762,7 +5761,6 @@@ static void make_load_communicators(gmx
  
  void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
  {
 -    gmx_bool                bZYX;
      int                     d, dim, i, j, m;
      ivec                    tmp, s;
      int                     nzone, nzonep;
@@@ -5911,16 -5921,17 +5919,16 @@@
  static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
  {
      gmx_domdec_t      *dd;
 +    dd   = cr->dd;
 +
 +#ifdef GMX_MPI
      gmx_domdec_comm_t *comm;
 -    int                i, rank, *buf;
 +    int                rank, *buf;
      ivec               periods;
 -#ifdef GMX_MPI
      MPI_Comm           comm_cart;
 -#endif
  
 -    dd   = cr->dd;
      comm = dd->comm;
  
 -#ifdef GMX_MPI
      if (comm->bCartesianPP)
      {
          /* Set up cartesian communication for the particle-particle part */
@@@ -5930,7 -5941,7 +5938,7 @@@
                      dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
          }
  
 -        for (i = 0; i < DIM; i++)
 +        for (int i = 0; i < DIM; i++)
          {
              periods[i] = TRUE;
          }
@@@ -5994,7 -6005,7 +6002,7 @@@
          /* Determine the master coordinates and rank.
           * The DD master should be the same node as the master of this sim.
           */
 -        for (i = 0; i < dd->nnodes; i++)
 +        for (int i = 0; i < dd->nnodes; i++)
          {
              if (comm->ddindex2simnodeid[i] == 0)
              {
@@@ -6032,27 -6043,30 +6040,27 @@@
      }
  }
  
 -static void receive_ddindex2simnodeid(t_commrec *cr)
 +static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
  {
 +#ifdef GMX_MPI
      gmx_domdec_t      *dd;
 -
      gmx_domdec_comm_t *comm;
 -    int               *buf;
  
      dd   = cr->dd;
      comm = dd->comm;
  
 -#ifdef GMX_MPI
      if (!comm->bCartesianPP_PME && comm->bCartesianPP)
      {
 +        int *buf;
          snew(comm->ddindex2simnodeid, dd->nnodes);
          snew(buf, dd->nnodes);
          if (cr->duty & DUTY_PP)
          {
              buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
          }
 -#ifdef GMX_MPI
          /* Communicate the ddindex to simulation nodeid index */
          MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                        cr->mpi_comm_mysim);
 -#endif
          sfree(buf);
      }
  #endif
@@@ -6094,8 -6108,9 +6102,8 @@@ static void split_communicator(FILE *fp
  {
      gmx_domdec_t      *dd;
      gmx_domdec_comm_t *comm;
 -    int                i, rank;
 +    int                i;
      gmx_bool           bDiv[DIM];
 -    ivec               periods;
  #ifdef GMX_MPI
      MPI_Comm           comm_cart;
  #endif
@@@ -6143,9 -6158,6 +6151,9 @@@
  #ifdef GMX_MPI
      if (comm->bCartesianPP_PME)
      {
 +        int  rank;
 +        ivec periods;
 +
          if (fplog)
          {
              fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
@@@ -6157,6 -6169,7 +6165,6 @@@
          }
          MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                          &comm_cart);
 -
          MPI_Comm_rank(comm_cart, &rank);
          if (MASTERNODE(cr) && rank != 0)
          {
@@@ -6340,7 -6353,7 +6348,7 @@@ static real *get_slb_frac(FILE *fplog, 
          for (i = 0; i < nc; i++)
          {
              dbl = 0;
 -            sscanf(size_string, "%lf%n", &dbl, &n);
 +            sscanf(size_string, "%20lf%n", &dbl, &n);
              if (dbl == 0)
              {
                  gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
@@@ -6403,7 -6416,7 +6411,7 @@@ static int dd_getenv(FILE *fplog, cons
      val = getenv(env_var);
      if (val)
      {
 -        if (sscanf(val, "%d", &nst) <= 0)
 +        if (sscanf(val, "%20d", &nst) <= 0)
          {
              nst = 1;
          }
@@@ -6464,7 -6477,7 +6472,7 @@@ static real average_cellsize_min(gmx_do
      {
          d = dd->dim[di];
          /* Check using the initial average cell size */
 -        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +        r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
      }
  
      return r;
@@@ -6474,6 -6487,7 +6482,6 @@@ static int check_dlb_support(FILE *fplo
                               const char *dlb_opt, gmx_bool bRecordLoad,
                               unsigned long Flags, t_inputrec *ir)
  {
 -    gmx_domdec_t *dd;
      int           eDLB = -1;
      char          buf[STRLEN];
  
@@@ -6615,10 -6629,10 +6623,10 @@@ gmx_domdec_t *init_domain_decomposition
      gmx_domdec_t      *dd;
      gmx_domdec_comm_t *comm;
      int                recload;
 -    int                d, i, j;
      real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
      gmx_bool           bC;
      char               buf[STRLEN];
 +    const real         tenPercentMargin = 1.1;
  
      if (fplog)
      {
@@@ -6673,7 -6687,8 +6681,8 @@@
      /* Initialize to GPU share count to 0, might change later */
      comm->nrank_gpu_shared = 0;
  
-     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+     comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+     comm->bDLB_locked = FALSE;
  
      comm->bDynLoadBal = (comm->eDLB == edlbYES);
      if (fplog)
@@@ -6739,6 -6754,13 +6748,13 @@@
      comm->cellsize_limit = 0;
      comm->bBondComm      = FALSE;
  
+     /* Atoms should be able to move by up to half the list buffer size (if > 0)
+      * within nstlist steps. Since boundaries are allowed to displace by half
+      * a cell size, DD cells should be at least the size of the list buffer.
+      */
 -    comm->cellsize_limit = max(comm->cellsize_limit,
 -                               ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
++    comm->cellsize_limit = std::max(comm->cellsize_limit,
++                                    ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
+ 
      if (comm->bInterCGBondeds)
      {
          if (comm_distance_min > 0)
@@@ -6750,7 -6772,7 +6766,7 @@@
              }
              else
              {
 -                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +                comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
              }
              r_bonded_limit = comm->cutoff_mbody;
          }
@@@ -6776,16 -6798,16 +6792,16 @@@
               */
              if (Flags & MD_DDBONDCOMM)
              {
 -                if (max(r_2b, r_mb) > comm->cutoff)
 +                if (std::max(r_2b, r_mb) > comm->cutoff)
                  {
 -                    r_bonded        = max(r_2b, r_mb);
 -                    r_bonded_limit  = 1.1*r_bonded;
 +                    r_bonded        = std::max(r_2b, r_mb);
 +                    r_bonded_limit  = tenPercentMargin*r_bonded;
                      comm->bBondComm = TRUE;
                  }
                  else
                  {
                      r_bonded       = r_mb;
 -                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                    r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
                  }
                  /* We determine cutoff_mbody later */
              }
@@@ -6794,12 -6816,12 +6810,12 @@@
                  /* No special bonded communication,
                   * simply increase the DD cut-off.
                   */
 -                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
                  comm->cutoff_mbody = r_bonded_limit;
 -                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +                comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
              }
          }
 -        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
          if (fplog)
          {
              fprintf(fplog,
@@@ -6833,7 -6855,7 +6849,7 @@@
                  "User supplied maximum distance required for P-LINCS: %.3f nm\n",
                  rconstr);
      }
 -    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +    comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
  
      comm->cgs_gl = gmx_mtop_global_cgs(mtop);
  
@@@ -6989,12 -7011,12 +7005,12 @@@
              if (comm->eDLB != edlbNO)
              {
                  /* Check if this does not limit the scaling */
 -                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +                comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
              }
              if (!comm->bBondComm)
              {
                  /* Without bBondComm do not go beyond the n.b. cut-off */
 -                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
                  if (comm->cellsize_limit >= comm->cutoff)
                  {
                      /* We don't loose a lot of efficieny
@@@ -7006,7 -7028,7 +7022,7 @@@
                  }
              }
              /* Check if we did not end up below our original limit */
 -            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +            comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
  
              if (comm->cutoff_mbody > comm->cellsize_limit)
              {
@@@ -7069,7 -7091,7 +7085,7 @@@ static void turn_on_dlb(FILE *fplog, t_
      cellsize_min = comm->cellsize_min[dd->dim[0]];
      for (d = 1; d < dd->ndim; d++)
      {
 -        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +        cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
      }
  
      if (cellsize_min < comm->cellsize_limit*1.05)
@@@ -7134,6 -7156,8 +7150,6 @@@ void dd_init_bondeds(FILE *fplog
                       t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
  {
      gmx_domdec_comm_t *comm;
 -    gmx_bool           bBondComm;
 -    int                d;
  
      dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
  
@@@ -7244,7 -7268,7 +7260,7 @@@ static void print_dd_settings(FILE *fpl
              limit = dd->comm->cellsize_min[XX];
              for (d = 1; d < DIM; d++)
              {
 -                limit = min(limit, dd->comm->cellsize_min[d]);
 +                limit = std::min(limit, dd->comm->cellsize_min[d]);
              }
          }
  
@@@ -7252,10 -7276,10 +7268,10 @@@
          {
              fprintf(fplog, "%40s  %-7s %6.3f nm\n",
                      "two-body bonded interactions", "(-rdd)",
 -                    max(comm->cutoff, comm->cutoff_mbody));
 +                    std::max(comm->cutoff, comm->cutoff_mbody));
              fprintf(fplog, "%40s  %-7s %6.3f nm\n",
                      "multi-body bonded interactions", "(-rdd)",
 -                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
          }
          if (dd->vsite_comm)
          {
@@@ -7290,7 -7314,7 +7306,7 @@@ static void set_cell_limits_dlb(gmx_dom
  
      /* Determine the maximum number of comm. pulses in one dimension */
  
 -    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
  
      /* Determine the maximum required number of grid pulses */
      if (comm->cellsize_limit >= comm->cutoff)
@@@ -7311,7 -7335,7 +7327,7 @@@
      else
      {
          /* There is no cell size limit */
 -        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +        npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
      }
  
      if (!bNoCutOff && npulse > 1)
@@@ -7323,9 -7347,9 +7339,9 @@@
              dim      = dd->dim[d];
              npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
                               /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 -            npulse_d_max = max(npulse_d_max, npulse_d);
 +            npulse_d_max = std::max(npulse_d_max, npulse_d);
          }
 -        npulse = min(npulse, npulse_d_max);
 +        npulse = std::min(npulse, npulse_d_max);
      }
  
      /* This env var can override npulse */
@@@ -7339,10 -7363,10 +7355,10 @@@
      comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
      for (d = 0; d < dd->ndim; d++)
      {
 -        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
          comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
          snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 -        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
          if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
          {
              comm->bVacDLBNoLimit = FALSE;
@@@ -7352,10 -7376,10 +7368,10 @@@
      /* cellsize_limit is set for LINCS in init_domain_decomposition */
      if (!comm->bVacDLBNoLimit)
      {
 -        comm->cellsize_limit = max(comm->cellsize_limit,
 -                                   comm->cutoff/comm->maxpulse);
 +        comm->cellsize_limit = std::max(comm->cellsize_limit,
 +                                        comm->cutoff/comm->maxpulse);
      }
 -    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
      /* Set the minimum cell size for each DD dimension */
      for (d = 0; d < dd->ndim; d++)
      {
@@@ -7372,7 -7396,7 +7388,7 @@@
      }
      if (comm->cutoff_mbody <= 0)
      {
 -        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +        comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
      }
      if (comm->bDynLoadBal)
      {
@@@ -7463,7 -7487,7 +7479,7 @@@ void set_dd_parameters(FILE *fplog, gmx
      }
      natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
  
 -    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +    dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
  }
  
  static gmx_bool test_dd_cutoff(t_commrec *cr,
@@@ -7564,6 -7588,20 +7580,20 @@@ void change_dd_dlb_cutoff_limit(t_commr
      comm->PMELoadBal_max_cutoff = comm->cutoff;
  }
  
+ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+ {
+     return dd->comm->bDLB_locked;
+ }
+ 
+ void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+ {
+     /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+     if (dd->comm->eDLB == edlbAUTO)
+     {
+         dd->comm->bDLB_locked = bValue;
+     }
+ }
+ 
  static void merge_cg_buffers(int ncell,
                               gmx_domdec_comm_dim_t *cd, int pulse,
                               int  *ncg_cell,
@@@ -7727,11 -7765,11 +7757,11 @@@ set_dd_corners(const gmx_domdec_t *dd
          c->c[1][1] = comm->cell_x0[dim1];
          if (dd->bGridJump)
          {
 -            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
              if (bDistMB)
              {
                  /* For the multi-body distance we need the maximum */
 -                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +                c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
              }
          }
          /* Set the upper-right corner for rounding */
@@@ -7754,8 -7792,8 +7784,8 @@@
                          if (j >= 4)
                          {
                              c->c[2][j-4] =
 -                                max(c->c[2][j-4],
 -                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                                std::max(c->c[2][j-4],
 +                                         comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                          }
                      }
                  }
@@@ -7767,7 -7805,7 +7797,7 @@@
                      {
                          for (j = 0; j < 2; j++)
                          {
 -                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                            c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
                          }
                      }
                  }
@@@ -7781,11 -7819,11 +7811,11 @@@
              c->cr1[3] = comm->cell_x1[dim1];
              if (dd->bGridJump)
              {
 -                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                  if (bDistMB)
                  {
                      /* For the multi-body distance we need the maximum */
 -                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                    c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                  }
              }
          }
@@@ -8065,7 -8103,7 +8095,7 @@@ static void setup_dd_communication(gmx_
  {
      int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
      int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 -    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                    c, i, cg, cg_gl, nrcg;
      int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
      gmx_domdec_comm_t     *comm;
      gmx_domdec_zones_t    *zones;
@@@ -8073,7 -8111,7 +8103,7 @@@
      gmx_domdec_ind_t      *ind;
      cginfo_mb_t           *cginfo_mb;
      gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 -    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    real                   r_comm2, r_bcomm2;
      dd_corners_t           corners;
      ivec                   tric_dist;
      rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
@@@ -8104,6 -8142,8 +8134,6 @@@
  
      for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
      {
 -        dim = dd->dim[dim_ind];
 -
          /* Check if we need to use triclinic distances */
          tric_dist[dim_ind] = 0;
          for (i = 0; i <= dim_ind; i++)
@@@ -8395,7 -8435,7 +8425,7 @@@
                      /* The rvec buffer is also required for atom buffers
                       * of size nrecv in dd_move_x and dd_move_f.
                       */
 -                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
                      vec_rvec_check_alloc(&comm->vbuf2, i);
                  }
              }
@@@ -8540,9 -8580,10 +8570,9 @@@ static void set_zones_size(gmx_domdec_
      gmx_domdec_comm_t  *comm;
      gmx_domdec_zones_t *zones;
      gmx_bool            bDistMB;
 -    int                 z, zi, zj0, zj1, d, dim;
 +    int                 z, zi, d, dim;
      real                rcs, rcmbs;
      int                 i, j;
 -    real                size_j, add_tric;
      real                vol;
  
      comm = dd->comm;
@@@ -8640,8 -8681,8 +8670,8 @@@
                                   * With multiple pulses this will lead
                                   * to a larger zone then strictly necessary.
                                   */
 -                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 -                                                             zones->size[zi].x1[dim]+rcmbs);
 +                                zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
 +                                                                  zones->size[zi].x1[dim]+rcmbs);
                              }
                          }
                      }
@@@ -8660,8 -8701,8 +8690,8 @@@
                  {
                      if (zones->shift[z][dim] > 0)
                      {
 -                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 -                                                     zones->size[zi].x1[dim]+rcs);
 +                        zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
 +                                                          zones->size[zi].x1[dim]+rcs);
                      }
                  }
              }
@@@ -8725,8 -8766,8 +8755,8 @@@
              {
                  for (i = 0; i < DIM; i++)
                  {
 -                    corner_min[i] = min(corner_min[i], corner[i]);
 -                    corner_max[i] = max(corner_max[i], corner[i]);
 +                    corner_min[i] = std::min(corner_min[i], corner[i]);
 +                    corner_max[i] = std::max(corner_max[i], corner[i]);
                  }
              }
          }
@@@ -8896,7 -8937,8 +8926,7 @@@ static int dd_sort_order(gmx_domdec_t *
  {
      gmx_domdec_sort_t *sort;
      gmx_cgsort_t      *cgsort, *sort_i;
 -    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 -    int                sort_last, sort_skip;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved;
  
      sort = dd->comm->sort;
  
@@@ -9007,7 -9049,7 +9037,7 @@@ static void dd_sort_state(gmx_domdec_t 
                            int ncg_home_old)
  {
      gmx_domdec_sort_t *sort;
 -    gmx_cgsort_t      *cgsort, *sort_i;
 +    gmx_cgsort_t      *cgsort;
      int               *cgindex;
      int                ncg_new, i, *ibuf, cgsize;
      rvec              *vbuf;
@@@ -9264,7 -9306,7 +9294,7 @@@ void dd_partition_system(FIL
      t_block           *cgs_gl;
      gmx_int64_t        step_pcoupl;
      rvec               cell_ns_x0, cell_ns_x1;
 -    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
      gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
      gmx_bool           bRedist, bSortCG, bResortAll;
      ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
@@@ -9324,17 -9366,18 +9354,17 @@@
      }
  
      /* Check if we have recorded loads on the nodes */
-     if (comm->bRecordLoad && dd_load_count(comm))
+     if (comm->bRecordLoad && dd_load_count(comm) > 0)
      {
-         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
          {
              /* Check if we should use DLB at the second partitioning
               * and every 100 partitionings,
               * so the extra communication cost is negligible.
               */
-             n         = std::max(100, nstglobalcomm);
+             const int nddp_chk_dlb = 100;
 -
              bCheckDLB = (comm->n_load_collect == 0 ||
-                          comm->n_load_have % n == n-1);
+                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
          }
          else
          {
@@@ -9372,8 -9415,26 +9402,26 @@@
                  /* Since the timings are node dependent, the master decides */
                  if (DDMASTER(dd))
                  {
-                     bTurnOnDLB =
-                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                     /* Here we check if the max PME rank load is more than 0.98
+                      * the max PP force load. If so, PP DLB will not help,
+                      * since we are (almost) limited by PME. Furthermore,
+                      * DLB will cause a significant extra x/f redistribution
+                      * cost on the PME ranks, which will then surely result
+                      * in lower total performance.
+                      * This check might be fragile, since one measurement
+                      * below 0.98 (although only done once every 100 DD part.)
+                      * could turn on DLB for the rest of the run.
+                      */
+                     if (cr->npmenodes > 0 &&
+                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                     {
+                         bTurnOnDLB = FALSE;
+                     }
+                     else
+                     {
+                         bTurnOnDLB =
+                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                     }
                      if (debug)
                      {
                          fprintf(debug, "step %s, imb loss %f\n",
diff --combined src/gromacs/mdlib/force.c
index f98b073fee,d4e6445491..4313d3e75e
--- a/src/gromacs/mdlib/force.c
+++ b/src/gromacs/mdlib/force.c
@@@ -34,39 -34,38 +34,39 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "gromacs/legacyheaders/force.h"
 +
 +#include "config.h"
  
 +#include <assert.h>
  #include <math.h>
  #include <string.h>
 -#include <assert.h>
 -#include "sysstuff.h"
 -#include "typedefs.h"
 -#include "macros.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "macros.h"
 -#include "physics.h"
 -#include "force.h"
 -#include "nonbonded.h"
 -#include "names.h"
 -#include "network.h"
 -#include "pbc.h"
 -#include "ns.h"
 -#include "nrnb.h"
 -#include "bondf.h"
 -#include "mshift.h"
 -#include "txtdump.h"
 -#include "coulomb.h"
 -#include "pme.h"
 -#include "mdrun.h"
 -#include "domdec.h"
 -#include "qmmm.h"
 -#include "gmx_omp_nthreads.h"
  
 +#include "gromacs/legacyheaders/coulomb.h"
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nonbonded.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/ns.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/qmmm.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/listed-forces/bonded.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/timing/wallcycle.h"
 -#include "gmx_fatal.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
  void ns(FILE              *fp,
          t_forcerec        *fr,
@@@ -118,9 -117,11 +118,11 @@@ static void reduce_thread_forces(int n
                                   int nthreads, f_thread_t *f_t)
  {
      int t, i;
+     int nthreads_loop gmx_unused;
  
      /* This reduction can run over any number of threads */
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+     nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+ #pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
      for (i = 0; i < n; i++)
      {
          for (t = 1; t < nthreads; t++)
@@@ -139,7 -140,13 +141,7 @@@
      }
  }
  
 -void gmx_print_sepdvdl(FILE *fplog, const char *s, real v, real dvdlambda)
 -{
 -    fprintf(fplog, "  %-30s V %12.5e  dVdl %12.5e\n", s, v, dvdlambda);
 -}
 -
 -void do_force_lowlevel(FILE       *fplog,   gmx_int64_t step,
 -                       t_forcerec *fr,      t_inputrec *ir,
 +void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                         t_idef     *idef,    t_commrec  *cr,
                         t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
                         t_mdatoms  *md,
@@@ -150,6 -157,7 +152,6 @@@
                         t_fcdata   *fcd,
                         gmx_localtop_t *top,
                         gmx_genborn_t *born,
 -                       t_atomtypes *atype,
                         gmx_bool       bBornRadii,
                         matrix     box,
                         t_lambda   *fepvals,
@@@ -162,7 -170,7 +164,7 @@@
  {
      int         i, j;
      int         donb_flags;
 -    gmx_bool    bDoEpot, bSepDVDL, bSB;
 +    gmx_bool    bDoEpot, bSB;
      int         pme_flags;
      matrix      boxs;
      rvec        box_size;
@@@ -176,6 -184,8 +178,6 @@@
      double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
  #endif
  
 -#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) { gmx_print_sepdvdl(fplog, s, v, dvdlambda); }
 -
      set_pbc(&pbc, fr->ePBC, box);
  
      /* reset free energy components */
@@@ -191,6 -201,7 +193,6 @@@
          box_size[i] = box[i][i];
      }
  
 -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog));
      debug_gmx();
  
      /* do QMMM first if requested */
@@@ -199,6 -210,12 +201,6 @@@
          enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr);
      }
  
 -    if (bSepDVDL)
 -    {
 -        fprintf(fplog, "Step %s: non-bonded V and dVdl for rank %d:\n",
 -                gmx_step_str(step, buf), cr->nodeid);
 -    }
 -
      /* Call the short range functions all in one go. */
  
  #ifdef GMX_MPI
@@@ -216,6 -233,7 +218,6 @@@
          /* foreign lambda component for walls */
          real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
                                     enerd->grpp.ener[egLJSR], nrnb);
 -        PRINT_SEPDVDL("Walls", 0.0, dvdl_walls);
          enerd->dvdl_lin[efptVDW] += dvdl_walls;
      }
  
@@@ -298,10 -316,10 +300,10 @@@
      /* MRS: Eventually, many need to include free energy contribution here! */
      if (ir->implicit_solvent)
      {
 -        wallcycle_sub_start(wcycle, ewcsBONDED);
 +        wallcycle_sub_start(wcycle, ewcsLISTED);
          calc_gb_forces(cr, md, born, top, x, f, fr, idef,
                         ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd);
 -        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +        wallcycle_sub_stop(wcycle, ewcsLISTED);
      }
  
  #ifdef GMX_MPI
@@@ -333,6 -351,24 +335,6 @@@
          enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
      }
  
 -    if (bSepDVDL)
 -    {
 -        real V_short_range    = 0;
 -        real dvdl_short_range = 0;
 -
 -        for (i = 0; i < enerd->grpp.nener; i++)
 -        {
 -            V_short_range +=
 -                (fr->bBHAM ?
 -                 enerd->grpp.ener[egBHAMSR][i] :
 -                 enerd->grpp.ener[egLJSR][i])
 -                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
 -        }
 -        dvdl_short_range = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
 -        PRINT_SEPDVDL("VdW and Coulomb SR particle-p.",
 -                      V_short_range,
 -                      dvdl_short_range);
 -    }
      debug_gmx();
  
  
@@@ -341,9 -377,9 +343,9 @@@
          pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
      }
  
 -    /* Shift the coordinates. Must be done before bonded forces and PPPM,
 +    /* Shift the coordinates. Must be done before listed forces and PPPM,
       * but is also necessary for SHAKE and update, therefore it can NOT
 -     * go when no bonded forces have to be evaluated.
 +     * go when no listed forces have to be evaluated.
       */
  
      /* Here sometimes we would not need to shift with NBFonly,
@@@ -361,9 -397,9 +363,9 @@@
              inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
          }
      }
 -    /* Check whether we need to do bondeds or correct for exclusions */
 +    /* Check whether we need to do listed interactions or correct for exclusions */
      if (fr->bMolPBC &&
 -        ((flags & GMX_FORCE_BONDED)
 +        ((flags & GMX_FORCE_LISTED)
           || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)))
      {
          /* Since all atoms are in the rectangular or triclinic unit-cell,
@@@ -373,13 -409,14 +375,13 @@@
      }
      debug_gmx();
  
 -    if (flags & GMX_FORCE_BONDED)
 +    if (flags & GMX_FORCE_LISTED)
      {
 -        wallcycle_sub_start(wcycle, ewcsBONDED);
 -        calc_bonds(fplog, cr->ms,
 -                   idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
 -                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
 -                   flags,
 -                   fr->bSepDVDL && do_per_step(step, ir->nstlog), step);
 +        wallcycle_sub_start(wcycle, ewcsLISTED);
 +        calc_bonds(cr->ms,
 +                   idef, (const rvec *) x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
 +                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
 +                   flags);
  
          /* Check if we have to determine energy differences
           * at foreign lambda's.
@@@ -398,7 -435,7 +400,7 @@@
                  {
                      lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
                  }
 -                calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
 +                calc_bonds_lambda(idef, (const rvec *) x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
                                    fcd, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                  sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
                  enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
@@@ -406,25 -443,17 +408,25 @@@
          }
          debug_gmx();
  
 -        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +        wallcycle_sub_stop(wcycle, ewcsLISTED);
      }
  
      where();
  
      *cycles_pme = 0;
 +    clear_mat(fr->vir_el_recip);
 +    clear_mat(fr->vir_lj_recip);
 +
 +    /* Do long-range electrostatics and/or LJ-PME, including related short-range
 +     * corrections.
 +     */
      if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
      {
 -        real Vlr             = 0, Vcorr = 0;
 -        real dvdl_long_range = 0;
 -        int  status          = 0;
 +        real Vlr               = 0, Vcorr = 0;
 +        real dvdl_long_range   = 0;
 +        int  status            = 0;
 +        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
 +        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
  
          bSB = (ir->nwall == 2);
          if (bSB)
@@@ -433,6 -462,20 +435,6 @@@
              svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
              box_size[ZZ] *= ir->wall_ewald_zfac;
          }
 -    }
 -
 -    /* Do long-range electrostatics and/or LJ-PME, including related short-range
 -     * corrections.
 -     */
 -
 -    clear_mat(fr->vir_el_recip);
 -    clear_mat(fr->vir_lj_recip);
 -
 -    if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
 -    {
 -        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
 -        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
 -        int  status            = 0;
  
          if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype))
          {
@@@ -534,10 -577,15 +536,10 @@@
                                                     fr->vir_el_recip);
              }
  
 -            PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr_q, dvdl_long_range_correction_q);
 -            PRINT_SEPDVDL("Ewald excl. corr. LJ", Vcorr_lj, dvdl_long_range_correction_lj);
              enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q;
              enerd->dvdl_lin[efptVDW]  += dvdl_long_range_correction_lj;
 -        }
  
 -        if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)))
 -        {
 -            if (cr->duty & DUTY_PME)
 +            if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)) && (cr->duty & DUTY_PME))
              {
                  /* Do reciprocal PME for Coulomb and/or LJ. */
                  assert(fr->n_tpi >= 0);
@@@ -609,6 -657,7 +611,6 @@@
                                          md->chargeA + md->homenr - fr->n_tpi,
                                          &Vlr_q);
                  }
 -                PRINT_SEPDVDL("PME mesh", Vlr_q + Vlr_lj, dvdl_long_range_q+dvdl_long_range_lj);
              }
          }
  
@@@ -619,6 -668,7 +621,6 @@@
                               box_size, cr, md->homenr,
                               fr->vir_el_recip, fr->ewaldcoeff_q,
                               lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table);
 -            PRINT_SEPDVDL("Ewald long-range", Vlr_q, dvdl_long_range_q);
          }
  
          /* Note that with separate PME nodes we get the real energies later */
@@@ -653,6 -703,8 +655,6 @@@
                                         fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
  
                  enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
 -                PRINT_SEPDVDL("RF exclusion correction",
 -                              enerd->term[F_RF_EXCL], dvdl_rf_excl);
              }
          }
      }
diff --combined src/gromacs/mdlib/mdatom.c
index 01c5c55bd6,b8d51a76da..d1722dedf1
--- a/src/gromacs/mdlib/mdatom.c
+++ b/src/gromacs/mdlib/mdatom.c
@@@ -34,16 -34,19 +34,16 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
  #include <math.h>
  
 -#include "typedefs.h"
 -#include "mdatoms.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/qmmm.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/topology/mtop_util.h"
  #include "gromacs/utility/smalloc.h"
 -#include "main.h"
 -#include "qmmm.h"
 -#include "mtop_util.h"
 -#include "gmx_omp_nthreads.h"
  
  #define ALMOST_ZERO 1e-30
  
@@@ -118,6 -121,7 +118,7 @@@ void atoms2md(gmx_mtop_t *mtop, t_input
      t_grpopts            *opts;
      gmx_groups_t         *groups;
      gmx_molblock_t       *molblock;
+     int                   nthreads gmx_unused;
  
      bLJPME = EVDW_PME(ir->vdwtype);
  
@@@ -230,7 -234,8 +231,8 @@@
  
      alook = gmx_mtop_atomlookup_init(mtop);
  
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntDefault);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
      for (i = 0; i < md->nr; i++)
      {
          int      g, ag, molb;
diff --combined src/gromacs/mdlib/minimize.c
index 234bcbdd8f,69008f53fa..5255d544d1
--- a/src/gromacs/mdlib/minimize.c
+++ b/src/gromacs/mdlib/minimize.c
@@@ -34,51 -34,51 +34,51 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "config.h"
  
 +#include <math.h>
  #include <string.h>
  #include <time.h>
 -#include <math.h>
 -#include "sysstuff.h"
 -#include "gromacs/utility/cstringutil.h"
 -#include "network.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "nrnb.h"
 -#include "main.h"
 -#include "force.h"
 -#include "macros.h"
 -#include "names.h"
 -#include "gmx_fatal.h"
 -#include "txtdump.h"
 -#include "typedefs.h"
 -#include "update.h"
 -#include "constr.h"
 -#include "vec.h"
 -#include "tgroup.h"
 -#include "mdebin.h"
 -#include "vsite.h"
 -#include "force.h"
 -#include "mdrun.h"
 -#include "md_support.h"
 -#include "sim_util.h"
 -#include "domdec.h"
 -#include "mdatoms.h"
 -#include "ns.h"
 -#include "mtop_util.h"
 -#include "pme.h"
 -#include "bondf.h"
 -#include "gmx_omp_nthreads.h"
 -#include "md_logging.h"
  
  #include "gromacs/fileio/confio.h"
 +#include "gromacs/fileio/mtxio.h"
  #include "gromacs/fileio/trajectory_writing.h"
 -#include "gromacs/linearalgebra/mtxio.h"
 +#include "gromacs/imd/imd.h"
 +#include "gromacs/legacyheaders/bonded-threading.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/md_logging.h"
 +#include "gromacs/legacyheaders/md_support.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/mdebin.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/ns.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/sim_util.h"
 +#include "gromacs/legacyheaders/tgroup.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/update.h"
 +#include "gromacs/legacyheaders/vsite.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/linearalgebra/sparsematrix.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
 -#include "gromacs/imd/imd.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
  typedef struct {
      t_state  s;
@@@ -550,6 -550,7 +550,7 @@@ static void do_em_step(t_commrec *cr, t
      int      start, end;
      rvec    *x1, *x2;
      real     dvdl_constr;
+     int      nthreads gmx_unused;
  
      s1 = &ems1->s;
      s2 = &ems2->s;
@@@ -587,7 -588,8 +588,8 @@@
      x1 = s1->x;
      x2 = s2->x;
  
- #pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+     nthreads = gmx_omp_nthreads_get(emntUpdate);
+ #pragma omp parallel num_threads(nthreads)
      {
          int gf, i, m;
  
@@@ -771,7 -773,7 +773,7 @@@ static void evaluate_energy(FILE *fplog
      }
  
      /* Calculate long range corrections to pressure and energy */
 -    calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
 +    calc_dispcorr(inputrec, fr, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
                    pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
      enerd->term[F_DISPCORR] = enercorr;
      enerd->term[F_EPOT]    += enercorr;
@@@ -790,6 -792,10 +792,6 @@@
                    ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
                    ems->s.lambda[efptBONDED], &dvdl_constr,
                    NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0);
 -        if (fr->bSepDVDL && fplog)
 -        {
 -            gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr);
 -        }
          enerd->term[F_DVDL_CONSTR] += dvdl_constr;
          m_add(force_vir, shake_vir, vir);
          wallcycle_stop(wcycle, ewcCONSTR);
@@@ -1786,6 -1792,7 +1788,6 @@@ double do_lbfgs(FILE *fplog, t_commrec 
      }
  
      stepsize  = 1.0/fnorm;
 -    converged = FALSE;
  
      /* Start the loop over BFGS steps.
       * Each successful step is counted, and we continue until
diff --combined src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 799425edab,d02e903410..c024f7b02c
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@@ -32,14 -32,12 +32,14 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "nbnxn_cuda.h"
 +
 +#include "config.h"
  
 -#include <stdlib.h>
  #include <assert.h>
 +#include <stdlib.h>
  
  #if defined(_MSVC)
  #include <limits>
@@@ -47,21 -45,21 +47,21 @@@
  
  #include <cuda.h>
  
 -#include "types/simple.h"
 -#include "types/nbnxn_pairlist.h"
 -#include "types/nb_verlet.h"
 -#include "types/ishift.h"
 -#include "types/force_flags.h"
 -#include "../nbnxn_consts.h"
 -
  #ifdef TMPI_ATOMICS
  #include "thread_mpi/atomic.h"
  #endif
  
 +#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/legacyheaders/types/simple.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/mdlib/nbnxn_pairlist.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/utility/cstringutil.h"
 +
  #include "nbnxn_cuda_types.h"
 -#include "../../gmxlib/cuda_tools/cudautils.cuh"
 -#include "nbnxn_cuda.h"
 -#include "nbnxn_cuda_data_mgmt.h"
  
  #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  #define USE_TEXOBJ
@@@ -81,7 -79,7 +81,7 @@@ texture<float, 1, cudaReadModeElementTy
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  
  /***** The kernels come here *****/
 -#include "nbnxn_cuda_kernel_utils.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
  
  /* Top-level kernel generation: will generate through multiple inclusion the
   * following flavors for all kernels:
@@@ -91,19 -89,19 +91,19 @@@
   * - force and energy output with pair list pruning.
   */
  /** Force only **/
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  
  /*** Pair-list pruning kernels ***/
  /** Force only **/
  #define PRUNE_NBL
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  #undef PRUNE_NBL
  
@@@ -212,7 -210,7 +212,7 @@@ static inline nbnxn_cu_kfunc_ptr_t sele
      nbnxn_cu_kfunc_ptr_t res;
  
      assert(eeltype < eelCuNR);
-     assert(evdwtype < eelCuNR);
+     assert(evdwtype < evdwCuNR);
  
      if (bDoEne)
      {
@@@ -716,18 -714,18 +716,18 @@@ void nbnxn_cuda_set_cacheconfig(cuda_de
              if (devinfo->prop.major >= 3)
              {
                  /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
              }
              else
              {
                  /* On Fermi prefer L1 gives 2% higher performance */
                  /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
              }
              CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
diff --combined src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index c468306899,3bdb17f8fe..3b37869e6d
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@@ -52,30 -52,30 +52,30 @@@
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
@@@ -88,30 -88,30 +88,30 @@@
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
@@@ -124,30 -124,30 +124,30 @@@
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
@@@ -157,39 -157,39 +157,39 @@@
  /* Analytical Ewald interaction kernels with twin-range cut-off
   */
  #define EL_EWALD_ANA
- #define LJ_CUTOFF_CHECK
+ #define VDW_CUTOFF_CHECK
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
  #undef EL_EWALD_ANA
- #undef LJ_CUTOFF_CHECK
+ #undef VDW_CUTOFF_CHECK
  
  
  /* Tabulated Ewald interaction kernels */
@@@ -197,30 -197,30 +197,30 @@@
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
@@@ -229,36 -229,36 +229,36 @@@
  
  /* Tabulated Ewald interaction kernels with twin-range cut-off */
  #define EL_EWALD_TAB
- #define LJ_CUTOFF_CHECK
+ #define VDW_CUTOFF_CHECK
  
  /* cut-off + V shift LJ */
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_GEOM
  #undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w LB combination rules */
  #define LJ_EWALD_COMB_LB
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_EWALD_COMB_LB
  #undef NB_KERNEL_FUNC_NAME
  /* F switch LJ */
  #define LJ_FORCE_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_FORCE_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  /* V switch LJ */
  #define LJ_POT_SWITCH
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
 -#include "nbnxn_cuda_kernel.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
  #undef LJ_POT_SWITCH
  #undef NB_KERNEL_FUNC_NAME
  
  #undef EL_EWALD_TAB
- #undef LJ_CUTOFF_CHECK
+ #undef VDW_CUTOFF_CHECK
diff --combined src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
index 9432b2f311,2aa4fa9fb5..5e410cdecc
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
@@@ -32,12 -32,12 +32,12 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include "typedefs.h"
 +#include "config.h"
  
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_simd.h"
  
  #ifdef {0}
@@@ -47,11 -47,10 +47,11 @@@
  {2}
  #define GMX_SIMD_J_UNROLL_SIZE {3}
  #include "{4}"
 -#include "../nbnxn_kernel_common.h"
 -#include "gmx_omp_nthreads.h"
 -#include "types/force_flags.h"
 -#include "gmx_fatal.h"
 +
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
 +#include "gromacs/utility/fatalerror.h"
  
  /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
   */
@@@ -118,7 -117,7 +118,7 @@@ reduce_group_energies(int ng, int ng_2l
  
  #else /* {0} */
  
 -#include "gmx_fatal.h"
 +#include "gromacs/utility/fatalerror.h"
  
  #endif /* {0} */
  
@@@ -139,6 -138,7 +139,7 @@@ voi
      nbnxn_pairlist_t **nbl;
      int                coulkt, vdwkt = 0;
      int                nb;
+     int                nthreads gmx_unused;
  
      nnbl = nbl_list->nnbl;
      nbl  = nbl_list->nbl;
@@@ -210,7 -210,8 +211,8 @@@
          gmx_incons("Unsupported VdW interaction type");
      }}
  
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
      for (nb = 0; nb < nnbl; nb++)
      {{
          nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
index 3db03bbec5,a104652965..4cad2a2f56
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
@@@ -32,24 -32,21 +32,24 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "nbnxn_kernel_ref.h"
 +
 +#include "config.h"
  
 -#include <math.h>
  #include <assert.h>
 +#include <math.h>
  
 -#include "typedefs.h"
 -#include "vec.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
 +#include "gromacs/pbcutil/ishift.h"
  #include "gromacs/utility/smalloc.h"
 -#include "force.h"
 -#include "gmx_omp_nthreads.h"
 -#include "nbnxn_kernel_ref.h"
 -#include "../nbnxn_consts.h"
 -#include "nbnxn_kernel_common.h"
  
  /*! \brief Typedefs for declaring lookup tables of kernel functions.
   */
@@@ -73,21 -70,21 +73,21 @@@ typedef void (*p_nbk_func_ener)(const n
  /* Analytical reaction-field kernels */
  #define CALC_COUL_RF
  #define LJ_CUT
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_CUT
  #define LJ_FORCE_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_FORCE_SWITCH
  #define LJ_POT_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_POT_SWITCH
  #define LJ_EWALD
  #define LJ_CUT
  #define LJ_EWALD_COMB_GEOM
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_GEOM
  #define LJ_EWALD_COMB_LB
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_LB
  #undef LJ_CUT
  #undef LJ_EWALD
@@@ -97,42 -94,42 +97,42 @@@
  /* Tabulated exclusion interaction electrostatics kernels */
  #define CALC_COUL_TAB
  #define LJ_CUT
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_CUT
  #define LJ_FORCE_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_FORCE_SWITCH
  #define LJ_POT_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_POT_SWITCH
  #define LJ_EWALD
  #define LJ_CUT
  #define LJ_EWALD_COMB_GEOM
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_GEOM
  #define LJ_EWALD_COMB_LB
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_LB
  #undef LJ_CUT
  #undef LJ_EWALD
  /* Twin-range cut-off kernels */
  #define VDW_CUTOFF_CHECK
  #define LJ_CUT
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_CUT
  #define LJ_FORCE_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_FORCE_SWITCH
  #define LJ_POT_SWITCH
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_POT_SWITCH
  #define LJ_EWALD
  #define LJ_CUT
  #define LJ_EWALD_COMB_GEOM
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_GEOM
  #define LJ_EWALD_COMB_LB
 -#include "nbnxn_kernel_ref_includes.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
  #undef LJ_EWALD_COMB_LB
  #undef LJ_CUT
  #undef LJ_EWALD
@@@ -185,6 -182,7 +185,7 @@@ nbnxn_kernel_ref(const nbnxn_pairlist_s
      int                coult;
      int                vdwt;
      int                nb;
+     int                nthreads gmx_unused;
  
      nnbl = nbl_list->nnbl;
      nbl  = nbl_list->nbl;
@@@ -242,7 -240,8 +243,8 @@@
          gmx_incons("Unsupported vdwtype in nbnxn reference kernel");
      }
  
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
      for (nb = 0; nb < nnbl; nb++)
      {
          nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
index ed1456873b,35181d43eb..1d8567b206
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
@@@ -37,12 -37,12 +37,12 @@@
   * kernel type 2xnn.
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include "typedefs.h"
 +#include "config.h"
  
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_simd.h"
  
  #ifdef GMX_NBNXN_SIMD_2XNN
@@@ -56,11 -56,10 +56,11 @@@
  
  #define GMX_SIMD_J_UNROLL_SIZE 2
  #include "nbnxn_kernel_simd_2xnn.h"
 -#include "../nbnxn_kernel_common.h"
 -#include "gmx_omp_nthreads.h"
 -#include "types/force_flags.h"
 -#include "gmx_fatal.h"
 +
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
 +#include "gromacs/utility/fatalerror.h"
  
  /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
   */
@@@ -253,7 -252,7 +253,7 @@@ reduce_group_energies(int ng, int ng_2l
  
  #else /* GMX_NBNXN_SIMD_2XNN */
  
 -#include "gmx_fatal.h"
 +#include "gromacs/utility/fatalerror.h"
  
  #endif /* GMX_NBNXN_SIMD_2XNN */
  
@@@ -274,6 -273,7 +274,7 @@@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_s
      nbnxn_pairlist_t **nbl;
      int                coulkt, vdwkt = 0;
      int                nb;
+     int                nthreads gmx_unused;
  
      nnbl = nbl_list->nnbl;
      nbl  = nbl_list->nbl;
@@@ -345,7 -345,8 +346,8 @@@
          gmx_incons("Unsupported VdW interaction type");
      }
  
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
      for (nb = 0; nb < nnbl; nb++)
      {
          nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
index d17ba1e7bd,5974873a07..3531299efd
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
@@@ -37,12 -37,12 +37,12 @@@
   * kernel type 4xn.
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include "typedefs.h"
 +#include "config.h"
  
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_simd.h"
  
  #ifdef GMX_NBNXN_SIMD_4XN
@@@ -55,11 -55,10 +55,11 @@@
  
  #define GMX_SIMD_J_UNROLL_SIZE 1
  #include "nbnxn_kernel_simd_4xn.h"
 -#include "../nbnxn_kernel_common.h"
 -#include "gmx_omp_nthreads.h"
 -#include "types/force_flags.h"
 -#include "gmx_fatal.h"
 +
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
 +#include "gromacs/utility/fatalerror.h"
  
  /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
   */
@@@ -252,7 -251,7 +252,7 @@@ reduce_group_energies(int ng, int ng_2l
  
  #else /* GMX_NBNXN_SIMD_4XN */
  
 -#include "gmx_fatal.h"
 +#include "gromacs/utility/fatalerror.h"
  
  #endif /* GMX_NBNXN_SIMD_4XN */
  
@@@ -273,6 -272,7 +273,7 @@@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_se
      nbnxn_pairlist_t **nbl;
      int                coulkt, vdwkt = 0;
      int                nb;
+     int                nthreads gmx_unused;
  
      nnbl = nbl_list->nnbl;
      nbl  = nbl_list->nbl;
@@@ -344,7 -344,8 +345,8 @@@
          gmx_incons("Unsupported VdW interaction type");
      }
  
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
      for (nb = 0; nb < nnbl; nb++)
      {
          nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_search.c
index f13de8c7db,3114bebc22..56e616967c
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@@ -33,35 -33,34 +33,35 @@@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "nbnxn_search.h"
 +
 +#include "config.h"
  
 +#include <assert.h>
  #include <math.h>
  #include <string.h>
 -#include <assert.h>
  
 -#include "sysstuff.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "types/commrec.h"
 -#include "macros.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/ns.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/utilities.h"
 -#include "vec.h"
 -#include "pbc.h"
 -#include "nbnxn_consts.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_atomdata.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/utility/smalloc.h"
 +
  /* nbnxn_internal.h included gromacs/simd/macros.h */
 -#include "nbnxn_internal.h"
 -#ifdef GMX_NBNXN_SIMD
 +#include "gromacs/mdlib/nbnxn_internal.h"
 +#ifdef GMX_SIMD
  #include "gromacs/simd/vector_operations.h"
  #endif
 -#include "nbnxn_atomdata.h"
 -#include "nbnxn_search.h"
 -#include "gmx_omp_nthreads.h"
 -#include "nrnb.h"
 -#include "ns.h"
 -
 -#include "gromacs/fileio/gmxfio.h"
  
  #ifdef NBNXN_SEARCH_BB_SIMD4
  /* Always use 4-wide SIMD for bounding box calculations */
@@@ -1383,13 -1382,14 +1383,13 @@@ static void sort_columns_supersub(cons
                                    int cxy_start, int cxy_end,
                                    int *sort_work)
  {
 -    int  cxy;
 -    int  cx, cy, cz = -1, c = -1, ncz;
 -    int  na, ash, na_c, ind, a;
 -    int  subdiv_z, sub_z, na_z, ash_z;
 -    int  subdiv_y, sub_y, na_y, ash_y;
 -    int  subdiv_x, sub_x, na_x, ash_x;
 +    int        cxy;
 +    int        cx, cy, cz = -1, c = -1, ncz;
 +    int        na, ash, na_c, ind, a;
 +    int        subdiv_z, sub_z, na_z, ash_z;
 +    int        subdiv_y, sub_y, na_y, ash_y;
 +    int        subdiv_x, sub_x, na_x, ash_x;
  
 -    /* cppcheck-suppress unassignedVariable */
      nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
  
      bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
@@@ -1931,6 -1931,7 +1931,7 @@@ void nbnxn_grid_add_simple(nbnxn_search
      float        *bbcz;
      nbnxn_bb_t   *bb;
      int           ncd, sc;
+     int           nthreads gmx_unused;
  
      grid = &nbs->grid[0];
  
@@@ -1957,7 -1958,8 +1958,8 @@@
      bbcz = grid->bbcz_simple;
      bb   = grid->bb_simple;
  
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntPairsearch);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
      for (sc = 0; sc < grid->nc; sc++)
      {
          int c, tx, na;
@@@ -2942,10 -2944,10 +2944,10 @@@ static void make_cluster_list_simple(co
  }
  
  #ifdef GMX_NBNXN_SIMD_4XN
 -#include "nbnxn_search_simd_4xn.h"
 +#include "gromacs/mdlib/nbnxn_search_simd_4xn.h"
  #endif
  #ifdef GMX_NBNXN_SIMD_2XNN
 -#include "nbnxn_search_simd_2xnn.h"
 +#include "gromacs/mdlib/nbnxn_search_simd_2xnn.h"
  #endif
  
  /* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
@@@ -4473,6 -4475,7 +4475,7 @@@ static void combine_nblists(int nnbl, n
  {
      int nsci, ncj4, nexcl;
      int n, i;
+     int nthreads gmx_unused;
  
      if (nblc->bSimple)
      {
@@@ -4513,7 -4516,8 +4516,8 @@@
      /* Each thread should copy its own data to the combined arrays,
       * as otherwise data will go back and forth between different caches.
       */
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntPairsearch);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
      for (n = 0; n < nnbl; n++)
      {
          int                     sci_offset;
diff --combined src/gromacs/mdlib/ns.c
index 9145929c47,430e26d304..3a1396cd40
--- a/src/gromacs/mdlib/ns.c
+++ b/src/gromacs/mdlib/ns.c
@@@ -34,33 -34,33 +34,33 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "gromacs/legacyheaders/ns.h"
  
  #include <math.h>
 +#include <stdlib.h>
  #include <string.h>
 -#include "sysstuff.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "macros.h"
 +
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nonbonded.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/nsgrid.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/utilities.h"
 -#include "vec.h"
 -#include "types/commrec.h"
 -#include "network.h"
 -#include "nsgrid.h"
 -#include "force.h"
 -#include "nonbonded.h"
 -#include "ns.h"
 -#include "pbc.h"
 -#include "names.h"
 -#include "gmx_fatal.h"
 -#include "nrnb.h"
 -#include "txtdump.h"
 -#include "mtop_util.h"
 -
 -#include "domdec.h"
 -#include "adress.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
  
 +#include "adress.h"
  
  /*
   *    E X C L U S I O N   H A N D L I N G
@@@ -287,8 -287,11 +287,8 @@@ void init_neighbor_list(FILE *log, t_fo
          /* Did we get the solvent loops so we can use optimized water kernels? */
          if (nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf == NULL
              || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf == NULL
 -#ifndef DISABLE_WATERWATER_NLIST
              || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf == NULL
 -            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL
 -#endif
 -            )
 +            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL)
          {
              fr->solvent_opt = esolNO;
              if (log != NULL)
@@@ -427,6 -430,14 +427,14 @@@ static gmx_inline void close_i_nblist(t
          nlist->jindex[nri+1] = nlist->nrj;
  
          len = nlist->nrj -  nlist->jindex[nri];
+         /* If there are no j-particles we have to reduce the
+          * number of i-particles again, to prevent errors in the
+          * kernel functions.
+          */
+         if ((len == 0) && (nlist->nri > 0))
+         {
+             nlist->nri--;
+         }
      }
  }
  
@@@ -661,11 -672,13 +669,11 @@@ put_in_list_at(gmx_bool              bH
  
      if (iwater != esolNO)
      {
 -        vdwc = &nlist[eNL_VDWQQ_WATER];
 -        vdw  = &nlist[eNL_VDW];
 -        coul = &nlist[eNL_QQ_WATER];
 -#ifndef DISABLE_WATERWATER_NLIST
 +        vdwc    = &nlist[eNL_VDWQQ_WATER];
 +        vdw     = &nlist[eNL_VDW];
 +        coul    = &nlist[eNL_QQ_WATER];
          vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
          coul_ww = &nlist[eNL_QQ_WATERWATER];
 -#endif
      }
      else
      {
@@@ -685,7 -698,9 +693,7 @@@
              if (bDoCoul && bDoVdW)
              {
                  new_i_nblist(vdwc, i_atom, shift, gid);
 -#ifndef DISABLE_WATERWATER_NLIST
                  new_i_nblist(vdwc_ww, i_atom, shift, gid);
 -#endif
              }
              if (bDoVdW)
              {
@@@ -694,7 -709,9 +702,7 @@@
              if (bDoCoul)
              {
                  new_i_nblist(coul, i_atom, shift, gid);
 -#ifndef DISABLE_WATERWATER_NLIST
                  new_i_nblist(coul_ww, i_atom, shift, gid);
 -#endif
              }
              /* Loop over the j charge groups */
              for (j = 0; (j < nj); j++)
@@@ -719,6 -736,19 +727,6 @@@
                      }
                      else
                      {
 -#ifdef DISABLE_WATERWATER_NLIST
 -                        /* Add entries for the three atoms - only do VdW if we need to */
 -                        if (!bDoVdW)
 -                        {
 -                            add_j_to_nblist(coul, jj0, bLR);
 -                        }
 -                        else
 -                        {
 -                            add_j_to_nblist(vdwc, jj0, bLR);
 -                        }
 -                        add_j_to_nblist(coul, jj0+1, bLR);
 -                        add_j_to_nblist(coul, jj0+2, bLR);
 -#else
                          /* One entry for the entire water-water interaction */
                          if (!bDoVdW)
                          {
@@@ -728,6 -758,7 +736,6 @@@
                          {
                              add_j_to_nblist(vdwc_ww, jj0, bLR);
                          }
 -#endif
                      }
                  }
                  else if (iwater == esolTIP4P && jwater == esolTIP4P)
@@@ -740,6 -771,16 +748,6 @@@
                      }
                      else
                      {
 -#ifdef DISABLE_WATERWATER_NLIST
 -                        /* Add entries for the four atoms - only do VdW if we need to */
 -                        if (bDoVdW)
 -                        {
 -                            add_j_to_nblist(vdw, jj0, bLR);
 -                        }
 -                        add_j_to_nblist(coul, jj0+1, bLR);
 -                        add_j_to_nblist(coul, jj0+2, bLR);
 -                        add_j_to_nblist(coul, jj0+3, bLR);
 -#else
                          /* One entry for the entire water-water interaction */
                          if (!bDoVdW)
                          {
@@@ -749,6 -790,7 +757,6 @@@
                          {
                              add_j_to_nblist(vdwc_ww, jj0, bLR);
                          }
 -#endif
                      }
                  }
                  else
@@@ -809,8 -851,10 +817,8 @@@
              close_i_nblist(vdw);
              close_i_nblist(coul);
              close_i_nblist(vdwc);
 -#ifndef DISABLE_WATERWATER_NLIST
              close_i_nblist(coul_ww);
              close_i_nblist(vdwc_ww);
 -#endif
          }
          else
          {
@@@ -2087,7 -2131,7 +2095,7 @@@ static int nsgrid_core(t_commrec *cr, t
      gmx_ns_t     *ns;
      atom_id     **nl_lr_ljc, **nl_lr_one, **nl_sr;
      int          *nlr_ljc, *nlr_one, *nsr;
 -    gmx_domdec_t *dd     = NULL;
 +    gmx_domdec_t *dd;
      t_block      *cgs    = &(top->cgs);
      int          *cginfo = fr->cginfo;
      /* atom_id *i_atoms,*cgsindex=cgs->index; */
@@@ -2115,7 -2159,10 +2123,7 @@@
      ns = &fr->ns;
  
      bDomDec = DOMAINDECOMP(cr);
 -    if (bDomDec)
 -    {
 -        dd = cr->dd;
 -    }
 +    dd      = cr->dd;
  
      bTriclinicX = ((YY < grid->npbcdim &&
                      (!bDomDec || dd->nc[YY] == 1) && box[YY][XX] != 0) ||
diff --combined src/gromacs/mdlib/sim_util.c
index e6686abd02,a6b166ccfd..b35dd827ac
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@@ -34,71 -34,70 +34,71 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "gromacs/legacyheaders/sim_util.h"
 +
 +#include "config.h"
  
  #include <assert.h>
  #include <math.h>
  #include <stdio.h>
  #include <string.h>
 +
  #ifdef HAVE_SYS_TIME_H
  #include <sys/time.h>
  #endif
  
 -#include "typedefs.h"
 -#include "gromacs/utility/cstringutil.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "names.h"
 -#include "txtdump.h"
 -#include "pbc.h"
 -#include "chargegroup.h"
 -#include "vec.h"
 -#include "nrnb.h"
 -#include "mshift.h"
 -#include "mdrun.h"
 -#include "sim_util.h"
 -#include "update.h"
 -#include "physics.h"
 -#include "main.h"
 -#include "mdatoms.h"
 -#include "force.h"
 -#include "bondf.h"
 -#include "pme.h"
 -#include "disre.h"
 -#include "orires.h"
 -#include "network.h"
 -#include "calcmu.h"
 -#include "constr.h"
 -#include "xvgr.h"
 -#include "copyrite.h"
 -#include "domdec.h"
 -#include "genborn.h"
 -#include "nbnxn_atomdata.h"
 -#include "nbnxn_search.h"
 -#include "nbnxn_kernels/nbnxn_kernel_ref.h"
 -#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
 -#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
 -#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 -#include "nonbonded.h"
 -#include "../gmxlib/nonbonded/nb_kernel.h"
 -#include "../gmxlib/nonbonded/nb_free_energy.h"
 -
 -#include "gromacs/timing/wallcycle.h"
 -#include "gromacs/timing/walltime_accounting.h"
 -#include "gromacs/utility/gmxmpi.h"
  #include "gromacs/essentialdynamics/edsam.h"
 +#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
 +#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
 +#include "gromacs/imd/imd.h"
 +#include "gromacs/legacyheaders/calcmu.h"
 +#include "gromacs/legacyheaders/chargegroup.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/copyrite.h"
 +#include "gromacs/legacyheaders/disre.h"
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/genborn.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nonbonded.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/orires.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/qmmm.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/update.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/listed-forces/bonded.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_atomdata.h"
 +#include "gromacs/mdlib/nbnxn_search.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
 +#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
 +#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/pulling/pull_rotation.h"
 -#include "gromacs/imd/imd.h"
 -#include "adress.h"
 -#include "qmmm.h"
 -
 -#include "gmx_omp_nthreads.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/smalloc.h"
  
 -#include "nbnxn_cuda_data_mgmt.h"
 -#include "nbnxn_cuda/nbnxn_cuda.h"
 +#include "adress.h"
  
  void print_time(FILE                     *out,
                  gmx_walltime_accounting_t walltime_accounting,
@@@ -314,7 -313,9 +314,7 @@@ static void calc_virial(int start, int 
      }
  }
  
 -static void posres_wrapper(FILE *fplog,
 -                           int flags,
 -                           gmx_bool bSepDVDL,
 +static void posres_wrapper(int flags,
                             t_inputrec *ir,
                             t_nrnb *nrnb,
                             gmx_localtop_t *top,
@@@ -336,6 -337,10 +336,6 @@@
                    ir->ePBC == epbcNONE ? NULL : &pbc,
                    lambda[efptRESTRAINT], &dvdl,
                    fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
 -    }
      enerd->term[F_POSRES] += v;
      /* If just the force constant changes, the FEP term is linear,
       * but if k changes, it is not.
@@@ -381,7 -386,9 +381,7 @@@ static void fbposres_wrapper(t_inputre
      inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
  }
  
 -static void pull_potential_wrapper(FILE *fplog,
 -                                   gmx_bool bSepDVDL,
 -                                   t_commrec *cr,
 +static void pull_potential_wrapper(t_commrec *cr,
                                     t_inputrec *ir,
                                     matrix box, rvec x[],
                                     rvec f[],
@@@ -406,11 -413,17 +406,11 @@@
      enerd->term[F_COM_PULL] +=
          pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
                         cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
 -    }
      enerd->dvdl_lin[efptRESTRAINT] += dvdl;
      wallcycle_stop(wcycle, ewcPULLPOT);
  }
  
 -static void pme_receive_force_ener(FILE           *fplog,
 -                                   gmx_bool        bSepDVDL,
 -                                   t_commrec      *cr,
 +static void pme_receive_force_ener(t_commrec      *cr,
                                     gmx_wallcycle_t wcycle,
                                     gmx_enerdata_t *enerd,
                                     t_forcerec     *fr)
@@@ -430,6 -443,11 +430,6 @@@
      gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
                        fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                        &cycles_seppme);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, "Electrostatic PME mesh", e_q, dvdl_q);
 -        gmx_print_sepdvdl(fplog, "Lennard-Jones PME mesh", e_lj, dvdl_lj);
 -    }
      enerd->term[F_COUL_RECIP] += e_q;
      enerd->term[F_LJ_RECIP]   += e_lj;
      enerd->dvdl_lin[efptCOUL] += dvdl_q;
@@@ -784,11 -802,6 +784,11 @@@ static void do_nb_verlet_fep(nbnxn_pair
      wallcycle_sub_stop(wcycle, ewcsNONBONDED);
  }
  
 +gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
 +{
 +    return nbv != NULL && nbv->bUseGPU;
 +}
 +
  void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                           t_inputrec *inputrec,
                           gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
@@@ -810,7 -823,7 +810,7 @@@
      int                 start, homenr;
      int                 nb_kernel_type;
      double              mu[2*DIM];
 -    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
 +    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
      gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
      gmx_bool            bDiffKernels = FALSE;
      matrix              boxs;
@@@ -827,6 -840,8 +827,6 @@@
      start  = 0;
      homenr = mdatoms->homenr;
  
 -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
 -
      clear_mat(vir_force);
  
      cg0 = 0;
@@@ -1236,11 -1251,10 +1236,11 @@@
      }
  
      /* We calculate the non-bonded forces, when done on the CPU, here.
 -     * We do this before calling do_force_lowlevel, as in there bondeds
 -     * forces are calculated before PME, which does communication.
 -     * With this order, non-bonded and bonded force calculation imbalance
 -     * can be balanced out by the domain decomposition load balancing.
 +     * We do this before calling do_force_lowlevel, because in that
 +     * function, the listed forces are calculated before PME, which
 +     * does communication.  With this order, non-bonded and listed
 +     * force calculation imbalance can be balanced out by the domain
 +     * decomposition load balancing.
       */
  
      if (!bUseOrEmulGPU)
@@@ -1320,22 -1334,22 +1320,22 @@@
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
      {
 -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
 +        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
      {
          fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
      }
  
      /* Compute the bonded and non-bonded energies and optionally forces */
 -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
 +    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
 -                      &(top->atomtypes), bBornRadii, box,
 +                      bBornRadii, box,
                        inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                        flags, &cycles_pme);
  
@@@ -1399,6 -1413,16 +1399,16 @@@
  
      if (bDoForces && DOMAINDECOMP(cr))
      {
+         if (bUseGPU)
+         {
+             /* We are done with the CPU compute, but the GPU local non-bonded
+              * kernel can still be running while we communicate the forces.
+              * We start a counter here, so we can, hopefully, time the rest
+              * of the GPU kernel execution and data transfer.
+              */
+             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+         }
+ 
          /* Communicate the forces */
          wallcycle_start(wcycle, ewcMOVEF);
          dd_move_f(cr->dd, f, fr->fshift);
@@@ -1429,13 -1453,44 +1439,44 @@@
          /* wait for local forces (or calculate in emulation mode) */
          if (bUseGPU)
          {
+             float       cycles_tmp, cycles_wait_est;
+             const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+ 
              wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
              nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                  nbv->grp[eintLocal].nbat,
                                  flags, eatLocal,
                                  enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                  fr->fshift);
-             cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+             cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ 
+             if (bDoForces && DOMAINDECOMP(cr))
+             {
+                 cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+ 
+                 if (cycles_tmp < cuda_api_overhead_margin)
+                 {
+                     /* We measured few cycles, it could be that the kernel
+                      * and transfer finished earlier and there was no actual
+                      * wait time, only API call overhead.
+                      * Then the actual time could be anywhere between 0 and
+                      * cycles_wait_est. As a compromise, we use half the time.
+                      */
+                     cycles_wait_est *= 0.5f;
+                 }
+             }
+             else
+             {
+                 /* No force communication so we actually timed the wait */
+                 cycles_wait_est = cycles_tmp;
+             }
+             /* Even though this is after dd_move_f, the actual task we are
+              * waiting for runs asynchronously with dd_move_f and we usually
+              * have nothing to balance it with, so we can and should add
+              * the time to the force time for load balancing.
+              */
+             cycles_force    += cycles_wait_est;
+             cycles_wait_gpu += cycles_wait_est;
  
              /* now clear the GPU outputs while we finish the step on the CPU */
  
@@@ -1519,7 -1574,7 +1560,7 @@@
          /* Since the COM pulling is always done mass-weighted, no forces are
           * applied to vsites and this call can be done after vsite spreading.
           */
 -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
 +        pull_potential_wrapper(cr, inputrec, box, x,
                                 f, vir_force, mdatoms, enerd, lambda, t,
                                 wcycle);
      }
@@@ -1540,7 -1595,7 +1581,7 @@@
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
 -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
 +        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@@ -1573,7 -1628,7 +1614,7 @@@ void do_force_cutsGROUP(FILE *fplog, t_
      int        cg0, cg1, i, j;
      int        start, homenr;
      double     mu[2*DIM];
 -    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
 +    gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
      gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
      gmx_bool   bDoAdressWF;
      matrix     boxs;
@@@ -1585,6 -1640,8 +1626,6 @@@
      start  = 0;
      homenr = mdatoms->homenr;
  
 -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
 -
      clear_mat(vir_force);
  
      cg0 = 0;
@@@ -1869,22 -1926,22 +1910,22 @@@
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
      {
 -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
 +        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
      {
          fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
      }
  
      /* Compute the bonded and non-bonded energies and optionally forces */
 -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
 +    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
 -                      &(top->atomtypes), bBornRadii, box,
 +                      bBornRadii, box,
                        inputrec->fepvals, lambda,
                        graph, &(top->excls), fr->mu_tot,
                        flags,
@@@ -1992,7 -2049,7 +2033,7 @@@
  
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
 -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
 +        pull_potential_wrapper(cr, inputrec, box, x,
                                 f, vir_force, mdatoms, enerd, lambda, t,
                                 wcycle);
      }
@@@ -2013,7 -2070,7 +2054,7 @@@
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
 -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
 +        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@@ -2429,8 -2486,8 +2470,8 @@@ void calc_enervirdiff(FILE *fplog, int 
      }
  }
  
 -void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
 -                   gmx_int64_t step, int natoms,
 +void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
 +                   int natoms,
                     matrix box, real lambda, tensor pres, tensor virial,
                     real *prescorr, real *enercorr, real *dvdlcorr)
  {
@@@ -2531,6 -2588,10 +2572,6 @@@
              }
          }
  
 -        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
 -        {
 -            gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
 -        }
          if (fr->efep != efepNO)
          {
              *dvdlcorr += dvdlambda;
@@@ -2635,7 -2696,7 +2676,7 @@@ void finish_run(FILE *fplog, t_commrec 
                  t_inputrec *inputrec,
                  t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                  gmx_walltime_accounting_t walltime_accounting,
 -                wallclock_gpu_t *gputimes,
 +                nonbonded_verlet_t *nbv,
                  gmx_bool bWriteStat)
  {
      int     i, j;
@@@ -2699,8 -2760,6 +2740,8 @@@
  
      if (SIMMASTER(cr))
      {
 +        wallclock_gpu_t* gputimes = use_GPU(nbv) ?
 +            nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
          wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                          elapsed_time_over_all_ranks,
                          wcycle, gputimes);
diff --combined src/gromacs/mdlib/update.c
index 3be84a8a93,93b7f59c1e..373212ca8a
--- a/src/gromacs/mdlib/update.c
+++ b/src/gromacs/mdlib/update.c
@@@ -34,37 -34,41 +34,37 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 +#include "gromacs/legacyheaders/update.h"
  
 -#include <stdio.h>
  #include <math.h>
 -
 -#include "types/commrec.h"
 -#include "sysstuff.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "typedefs.h"
 -#include "nrnb.h"
 -#include "physics.h"
 -#include "macros.h"
 -#include "vec.h"
 -#include "main.h"
 -#include "update.h"
 -#include "gromacs/random/random.h"
 -#include "mshift.h"
 -#include "tgroup.h"
 -#include "force.h"
 -#include "names.h"
 -#include "txtdump.h"
 -#include "mdrun.h"
 -#include "constr.h"
 -#include "disre.h"
 -#include "orires.h"
 -#include "gmx_omp_nthreads.h"
 +#include <stdio.h>
  
  #include "gromacs/fileio/confio.h"
 -#include "gromacs/fileio/futil.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/disre.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/orires.h"
 +#include "gromacs/legacyheaders/tgroup.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/pulling/pull.h"
 +#include "gromacs/random/random.h"
  #include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/futil.h"
  #include "gromacs/utility/gmxomp.h"
 -#include "gromacs/pulling/pull.h"
 +#include "gromacs/utility/smalloc.h"
  
  /*For debugging, start at v(-dt/2) for velolcity verlet -- uncomment next line */
  /*#define STARTFROMDT2*/
@@@ -1767,7 -1771,9 +1767,9 @@@ void update_constraints(FIL
          }
          else
          {
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+             nth = gmx_omp_nthreads_get(emntUpdate);
+ 
+ #pragma omp parallel for num_threads(nth) schedule(static)
              for (i = start; i < nrend; i++)
              {
                  copy_rvec(upd->xp[i], state->x[i]);
diff --combined src/gromacs/timing/wallcycle.c
index 27eb275cd0,ae26c2cec4..3b0d906636
--- a/src/gromacs/timing/wallcycle.c
+++ b/src/gromacs/timing/wallcycle.c
@@@ -34,22 -34,21 +34,22 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#include "gromacs/timing/wallcycle.h"
 +#include "gmxpre.h"
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "wallcycle.h"
  
 -#include <string.h>
 +#include "config.h"
  
 -#include "gromacs/utility/smalloc.h"
 -#include "gmx_fatal.h"
 -#include "md_logging.h"
 -#include "gromacs/utility/cstringutil.h"
 +#include <stdlib.h>
 +#include <string.h>
  
 +#include "gromacs/legacyheaders/md_logging.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/timing/cyclecounter.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/smalloc.h"
  
  /* DEBUG_WCYCLE adds consistency checking for the counters.
   * It checks if you stop a counter different from the last
@@@ -99,7 -98,7 +99,7 @@@ static const char *wcn[ewcNR] 
      "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
      "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
      "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
-     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
      "Vsite spread", "COM pull force",
      "Write traj.", "Update", "Constraints", "Comm. energies",
      "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
@@@ -110,7 -109,7 +110,7 @@@ static const char *wcsn[ewcsNR] 
      "DD redist.", "DD NS grid + sort", "DD setup comm.",
      "DD make top.", "DD make constr.", "DD top. other",
      "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 -    "Bonded F", "Nonbonded F", "Ewald F correction",
 +    "Listed F", "Nonbonded F", "Ewald F correction",
      "NB X buffer ops.", "NB F buffer ops."
  };
  
@@@ -388,6 -387,12 +388,12 @@@ void wallcycle_sum(t_commrec *cr, gmx_w
  
      wcc = wc->wcc;
  
+     /* The GPU wait estimate counter is used for load balancing only
+      * and will mess up the total due to double counting: clear it.
+      */
+     wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+     wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+ 
      for (i = 0; i < ewcNR; i++)
      {
          if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
diff --combined src/gromacs/timing/wallcycle.h
index 104173598a,7b48849707..69c92cde72
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@@ -38,23 -38,19 +38,23 @@@
  #define GMX_TIMING_WALLCYCLE_H
  
  #include <stdio.h>
 -#include "gromacs/legacyheaders/typedefs.h"
 -#include "gromacs/legacyheaders/types/commrec.h"
 +
 +#include "gromacs/legacyheaders/types/commrec_fwd.h"
 +#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
 +#include "gromacs/utility/basedefinitions.h"
  
  #ifdef __cplusplus
  extern "C" {
  #endif
  
 +typedef struct gmx_wallcycle *gmx_wallcycle_t;
 +
  enum {
      ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
      ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
      ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
      ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
-     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS,
      ewcVSITESPREAD, ewcPULLPOT,
      ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
      ewcTEST, ewcNR
@@@ -66,7 -62,7 +66,7 @@@ enum 
      ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER,
      ewcsNBS_GRID_LOCAL, ewcsNBS_GRID_NONLOCAL,
      ewcsNBS_SEARCH_LOCAL, ewcsNBS_SEARCH_NONLOCAL,
 -    ewcsBONDED, ewcsNONBONDED, ewcsEWALD_CORRECTION,
 +    ewcsLISTED, ewcsNONBONDED, ewcsEWALD_CORRECTION,
      ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS,
      ewcsNR
  };
diff --combined src/programs/mdrun/md.cpp
index 9864f8789f,3d98d597c7..0504150c00
--- a/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@@ -34,67 -34,67 +34,67 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include "typedefs.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "sysstuff.h"
 -#include "vec.h"
 -#include "vcm.h"
 -#include "mdebin.h"
 -#include "nrnb.h"
 -#include "calcmu.h"
 -#include "index.h"
 -#include "vsite.h"
 -#include "update.h"
 -#include "ns.h"
 -#include "mdrun.h"
 -#include "md_support.h"
 -#include "md_logging.h"
 -#include "network.h"
 -#include "xvgr.h"
 -#include "physics.h"
 -#include "names.h"
 -#include "force.h"
 -#include "disre.h"
 -#include "orires.h"
 -#include "pme.h"
 -#include "mdatoms.h"
 -#include "repl_ex.h"
 -#include "deform.h"
 -#include "qmmm.h"
 -#include "domdec.h"
 -#include "domdec_network.h"
 -#include "gromacs/gmxlib/topsort.h"
 -#include "coulomb.h"
 -#include "constr.h"
 -#include "shellfc.h"
 -#include "gromacs/gmxpreprocess/compute_io.h"
 -#include "checkpoint.h"
 -#include "mtop_util.h"
 -#include "sighandler.h"
 -#include "txtdump.h"
 -#include "gromacs/utility/cstringutil.h"
 -#include "pme_loadbal.h"
 -#include "bondf.h"
 -#include "membed.h"
 -#include "types/nlistheuristics.h"
 -#include "types/iteratedconstraints.h"
 -#include "nbnxn_cuda_data_mgmt.h"
 +#include "config.h"
 +
 +#include <stdlib.h>
  
 -#include "gromacs/utility/gmxmpi.h"
  #include "gromacs/fileio/confio.h"
 +#include "gromacs/fileio/mdoutf.h"
  #include "gromacs/fileio/trajectory_writing.h"
  #include "gromacs/fileio/trnio.h"
  #include "gromacs/fileio/trxio.h"
  #include "gromacs/fileio/xtcio.h"
 -#include "gromacs/timing/wallcycle.h"
 -#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/gmxpreprocess/compute_io.h"
 +#include "gromacs/imd/imd.h"
 +#include "gromacs/legacyheaders/bonded-threading.h"
 +#include "gromacs/legacyheaders/calcmu.h"
 +#include "gromacs/legacyheaders/checkpoint.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/coulomb.h"
 +#include "gromacs/legacyheaders/disre.h"
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/domdec_network.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/md_logging.h"
 +#include "gromacs/legacyheaders/md_support.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/mdebin.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/ns.h"
 +#include "gromacs/legacyheaders/orires.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/qmmm.h"
 +#include "gromacs/legacyheaders/shellfc.h"
 +#include "gromacs/legacyheaders/sighandler.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/update.h"
 +#include "gromacs/legacyheaders/vcm.h"
 +#include "gromacs/legacyheaders/vsite.h"
 +#include "gromacs/legacyheaders/types/iteratedconstraints.h"
 +#include "gromacs/legacyheaders/types/nlistheuristics.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/swap/swapcoords.h"
 -#include "gromacs/imd/imd.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/smalloc.h"
 +
 +#include "deform.h"
 +#include "membed.h"
 +#include "pme_loadbal.h"
 +#include "repl_ex.h"
  
  #ifdef GMX_FAHCORE
  #include "corewrap.h"
@@@ -105,7 -105,7 +105,7 @@@ static void reset_all_counters(FILE *fp
                                 gmx_int64_t *step_rel, t_inputrec *ir,
                                 gmx_wallcycle_t wcycle, t_nrnb *nrnb,
                                 gmx_walltime_accounting_t walltime_accounting,
 -                               nbnxn_cuda_ptr_t cu_nbv)
 +                               struct nonbonded_verlet_t *nbv)
  {
      char sbuf[STEPSTRSIZE];
  
@@@ -113,7 -113,10 +113,7 @@@
      md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                    gmx_step_str(step, sbuf));
  
 -    if (cu_nbv)
 -    {
 -        nbnxn_cuda_reset_timings(cu_nbv);
 -    }
 +    nbnxn_cuda_reset_timings(nbv);
  
      wallcycle_stop(wcycle, ewcRUN);
      wallcycle_reset_all(wcycle);
@@@ -167,7 -170,7 +167,7 @@@ double do_md(FILE *fplog, t_commrec *cr
      rvec              mu_tot;
      t_vcm            *vcm;
      t_state          *bufstate = NULL;
 -    matrix           *scale_tot, pcoupl_mu, M, ebox;
 +    matrix            pcoupl_mu, M;
      gmx_nlheur_t      nlh;
      t_trxframe        rerun_fr;
      gmx_repl_ex_t     repl_ex = NULL;
@@@ -186,8 -189,10 +186,8 @@@
      gmx_ekindata_t   *ekind, *ekind_save;
      gmx_shellfc_t     shellfc;
      int               count, nconverged = 0;
 -    real              timestep   = 0;
 -    double            tcount     = 0;
 -    gmx_bool          bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 -    gmx_bool          bAppend;
 +    double            tcount                 = 0;
 +    gmx_bool          bConverged             = TRUE, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
      gmx_bool          bResetCountersHalfMaxH = FALSE;
      gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
      gmx_bool          bUpdateDoLR;
@@@ -202,6 -207,7 +202,6 @@@
      double            cycles;
      real              saved_conserved_quantity = 0;
      real              last_ekin                = 0;
 -    int               iter_i;
      t_extmass         MassQ;
      int             **trotter_seq;
      char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
@@@ -225,6 -231,7 +225,6 @@@
  
      /* Check for special mdrun options */
      bRerunMD = (Flags & MD_RERUN);
 -    bAppend  = (Flags & MD_APPENDFILES);
      if (Flags & MD_RESETCOUNTERSHALFWAY)
      {
          if (ir->nsteps > 0)
@@@ -476,7 -483,7 +476,7 @@@
       */
      if ((Flags & MD_TUNEPME) &&
          EEL_PME(fr->eeltype) &&
 -        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
 +        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
          !bRerunMD)
      {
          pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
@@@ -719,7 -726,9 +719,7 @@@
      bStateFromTPX    = !bStateFromCP;
      bInitStep        = bFirstStep && (bStateFromTPX || bVV);
      bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 -    bLastStep        = FALSE;
      bSumEkinhOld     = FALSE;
 -    bDoReplEx        = FALSE;
      bExchanged       = FALSE;
      bNeedRepartition = FALSE;
  
@@@ -728,7 -737,10 +728,7 @@@
      step     = ir->init_step;
      step_rel = 0;
  
 -    if (ir->nstlist == -1)
 -    {
 -        init_nlistheuristics(&nlh, bGStatEveryStep, step);
 -    }
 +    init_nlistheuristics(&nlh, bGStatEveryStep, step);
  
      if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
      {
@@@ -1149,6 -1161,7 +1149,6 @@@
                      }
                  }
  
 -                bOK = TRUE;
                  if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
                  {
                      wallcycle_stop(wcycle, ewcUPDATE);
@@@ -1164,6 -1177,12 +1164,6 @@@
                          /* Correct the virial for multiple time stepping */
                          m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
                      }
 -
 -                    if (!bOK)
 -                    {
 -                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 -                    }
 -
                  }
                  else if (graph)
                  {
@@@ -1464,6 -1483,7 +1464,6 @@@
               */
              copy_mat(state->box, lastbox);
  
 -            bOK         = TRUE;
              dvdl_constr = 0;
  
              if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
@@@ -1573,7 -1593,15 +1573,7 @@@
                                         FALSE, bCalcVir,
                                         state->veta);
                  }
 -                if (!bOK)
 -                {
 -                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 -                }
  
 -                if (fr->bSepDVDL && fplog && do_log)
 -                {
 -                    gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr);
 -                }
                  if (bVV)
                  {
                      /* this factor or 2 correction is necessary
@@@ -1881,6 -1909,21 +1881,21 @@@
                      }
                      dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
  
+                     if (bPMETuneRunning &&
 -                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
++                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+                         !(cr->duty & DUTY_PME))
+                     {
+                         /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                          * With GPUs + separate PME ranks, we don't want DLB.
+                          * This could happen when we scan coarse grids and
+                          * it would then never be turned off again.
+                          * This would hurt performance at the final, optimal
+                          * grid spacing, where DLB almost never helps.
+                          * Also, DLB can limit the cut-off for PME tuning.
+                          */
+                         dd_dlb_set_lock(cr->dd, TRUE);
+                     }
+ 
                      if (bPMETuneRunning || step_rel > ir->nstlist*50)
                      {
                          bPMETuneTry     = FALSE;
@@@ -1911,6 -1954,16 +1926,16 @@@
                      {
                          calc_enervirdiff(NULL, ir->eDispCorr, fr);
                      }
+ 
+                     if (!bPMETuneRunning &&
+                         DOMAINDECOMP(cr) &&
+                         dd_dlb_is_locked(cr->dd))
+                     {
+                         /* Unlock the DLB=auto, DLB is allowed to activate
+                          * (but we don't expect it to activate in most cases).
+                          */
+                         dd_dlb_set_lock(cr->dd, FALSE);
+                     }
                  }
                  cycles_pmes = 0;
              }
@@@ -1921,7 -1974,7 +1946,7 @@@
          {
              /* Reset all the counters related to performance over the run */
              reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
 -                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
 +                               use_GPU(fr->nbv) ? fr->nbv : NULL);
              wcycle_set_reset_counters(wcycle, -1);
              if (!(cr->duty & DUTY_PME))
              {
@@@ -1980,7 -2033,7 +2005,7 @@@
      if (pme_loadbal != NULL)
      {
          pme_loadbal_done(pme_loadbal, cr, fplog,
 -                         fr->nbv != NULL && fr->nbv->bUseGPU);
 +                         use_GPU(fr->nbv));
      }
  
      if (shellfc && fplog)
diff --combined src/programs/mdrun/pme_loadbal.c
index 5ba76e63e0,d6fd68f372..dd2cd1a2e9
--- a/src/programs/mdrun/pme_loadbal.c
+++ b/src/programs/mdrun/pme_loadbal.c
@@@ -32,27 -32,23 +32,27 @@@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include "gromacs/utility/smalloc.h"
 -#include "types/commrec.h"
 -#include "network.h"
 -#include "calcgrid.h"
 -#include "pme.h"
 -#include "vec.h"
 -#include "domdec.h"
 -#include "nbnxn_cuda_data_mgmt.h"
 -#include "force.h"
 -#include "macros.h"
 -#include "md_logging.h"
  #include "pme_loadbal.h"
  
 +#include "config.h"
 +
 +#include "gromacs/legacyheaders/calcgrid.h"
 +#include "gromacs/legacyheaders/domdec.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/md_logging.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/pme.h"
 +#include "gromacs/legacyheaders/sim_util.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/smalloc.h"
 +
  /* Parameters and setting for one PP-PME setup */
  typedef struct {
      real      rcut_coulomb;    /* Coulomb cut-off                              */
@@@ -265,6 -261,15 +265,15 @@@ static gmx_bool pme_loadbal_increase_cu
      while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
  
      set->rcut_coulomb = pme_lb->cut_spacing*sp;
+     if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+     {
+         /* This is unlikely, but can happen when e.g. continuing from
+          * a checkpoint after equilibration where the box shrank a lot.
+          * We want to avoid rcoulomb getting smaller than rvdw
+          * and there might be more issues with decreasing rcoulomb.
+          */
+         set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+     }
  
      if (pme_lb->cutoff_scheme == ecutsVERLET)
      {
@@@ -431,17 -436,17 +440,17 @@@ static void switch_to_stage1(pme_load_b
      pme_lb->cur = pme_lb->start - 1;
  }
  
 -gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
 -                          t_commrec           *cr,
 -                          FILE                *fp_err,
 -                          FILE                *fp_log,
 -                          t_inputrec          *ir,
 -                          t_state             *state,
 -                          double               cycles,
 -                          interaction_const_t *ic,
 -                          nonbonded_verlet_t  *nbv,
 -                          gmx_pme_t           *pmedata,
 -                          gmx_int64_t          step)
 +gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
 +                          t_commrec                  *cr,
 +                          FILE                       *fp_err,
 +                          FILE                       *fp_log,
 +                          t_inputrec                 *ir,
 +                          t_state                    *state,
 +                          double                      cycles,
 +                          interaction_const_t        *ic,
 +                          struct nonbonded_verlet_t  *nbv,
 +                          gmx_pme_t                  *pmedata,
 +                          gmx_int64_t                 step)
  {
      gmx_bool     OK;
      pme_setup_t *set;
@@@ -693,26 -698,30 +702,26 @@@
      }
  
      bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
 -    if (pme_lb->cutoff_scheme == ecutsVERLET &&
 -        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 -    {
 -        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
 -
 -        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
 -         * also sharing texture references. To keep the code simple, we don't
 -         * treat texture references as shared resources, but this means that
 -         * the coulomb_tab texture ref will get updated by multiple threads.
 -         * Hence, to ensure that the non-bonded kernels don't start before all
 -         * texture binding operations are finished, we need to wait for all ranks
 -         * to arrive here before continuing.
 -         *
 -         * Note that we could omit this barrier if GPUs are not shared (or
 -         * texture objects are used), but as this is initialization code, there
 -         * is not point in complicating things.
 -         */
 +    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
 +
 +    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
 +     * also sharing texture references. To keep the code simple, we don't
 +     * treat texture references as shared resources, but this means that
 +     * the coulomb_tab texture ref will get updated by multiple threads.
 +     * Hence, to ensure that the non-bonded kernels don't start before all
 +     * texture binding operations are finished, we need to wait for all ranks
 +     * to arrive here before continuing.
 +     *
 +     * Note that we could omit this barrier if GPUs are not shared (or
 +     * texture objects are used), but as this is initialization code, there
 +     * is not point in complicating things.
 +     */
  #ifdef GMX_THREAD_MPI
 -        if (PAR(cr))
 -        {
 -            gmx_barrier(cr);
 -        }
 -#endif  /* GMX_THREAD_MPI */
 +    if (PAR(cr) && use_GPU(nbv))
 +    {
 +        gmx_barrier(cr);
      }
 +#endif  /* GMX_THREAD_MPI */
  
      /* Usually we won't need the simple tables with GPUs.
       * But we do with hybrid acceleration and with free energy.