Merge release-5-0 into master

author Mark Abraham <mark.j.abraham@gmail.com>

Wed, 1 Oct 2014 20:02:02 +0000 (22:02 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 1 Oct 2014 21:18:54 +0000 (23:18 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Wed, 1 Oct 2014 20:02:02 +0000 (22:02 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 1 Oct 2014 21:18:54 +0000 (23:18 +0200)
diff --combined CMakeLists.txt

index 70185949c4ac8cae13677e083501bd2f14c71124,53498532af1377e335ee0747ace076bf86f54d6a..7fbe947522f707b751ec25248446b94e39af19e4
--- 1/CMakeLists.txt
--- 2/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -48,28 -48,75 +48,28 @@@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CM
   set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
   
- -# PROJECT_VERSION should have the following structure:
- -# VERSION-dev[-SUFFIX] where the VERSION should have the for: vMajor.vMinor.vPatch
- -#
- -# The "-dev" suffix is important to keep because it makes possible to distinguish
- -# between a build from official release and a build from git release branch on a
- -# machine with no git.
- -#
- -# NOTE: when releasing the "-dev" suffix needs to be stripped off!
- -# REGRESSIONTEST_VERSION and REGRESSIONTEST_BRANCH should always be
- -# defined.
- -set(PROJECT_VERSION "5.0.3-dev")
- -# If this is a released tarball, "-dev" will not be present in
- -# PROJECT_VERSION, and REGRESSIONTEST_VERSION specifies the version
- -# number of the regressiontest tarball against which the code tarball
- -# can be tested. This will be the version of the last patch release.
- -set(REGRESSIONTEST_VERSION "5.0.3-dev")
- -# The MD5 checksum of the regressiontest tarball. Only used if "-dev"
- -# is not present in the PROJECT_VERSION
- -set(REGRESSIONTEST_MD5SUM "004f2377adf252e2caa9c241390d4b9c")
- -# If this is not a released tarball, "-dev" will be present in
- -# PROJECT_VERSION, and REGRESSIONTEST_BRANCH specifies the name of the
- -# gerrit.gromacs.org branch whose HEAD can test this code, *if* this
- -# code contains all recent fixes from the corresponding code branch.
- -set(REGRESSIONTEST_BRANCH "refs/heads/release-5-0")
- -
- -set(CUSTOM_VERSION_STRING ""
- -    CACHE STRING "Custom version string (if empty, use hard-coded default)")
- -mark_as_advanced(CUSTOM_VERSION_STRING)
- -if (CUSTOM_VERSION_STRING)
- -    set(PROJECT_VERSION ${CUSTOM_VERSION_STRING})
- -endif()
- -set(LIBRARY_SOVERSION 0)
- -set(LIBRARY_VERSION ${LIBRARY_SOVERSION}.0.0)
- -# It is a bit irritating, but this has to be set separately for now!
- -SET(CPACK_PACKAGE_VERSION_MAJOR "5")
- -SET(CPACK_PACKAGE_VERSION_MINOR "0")
- -#SET(CPACK_PACKAGE_VERSION_PATCH "0")
- -
- -# The numerical gromacs version. It is 40600 for 4.6.0.
- -# The #define GMX_VERSION in gromacs/version.h is set to this value.
- -math(EXPR NUM_VERSION
- -    "${CPACK_PACKAGE_VERSION_MAJOR}*10000 + ${CPACK_PACKAGE_VERSION_MINOR}*100")
- -if(CPACK_PACKAGE_VERSION_PATCH)
- -    math(EXPR NUM_VERSION
- -         "${NUM_VERSION} + ${CPACK_PACKAGE_VERSION_PATCH}")
- -endif()
- -
- -# The API version tracks the numerical Gromacs version (for now).
- -# It is potentially different from the Gromacs version in the future, if
- -# the programs/libraries diverge from the presumably more stable API.
- -# The #define GMX_API_VERSION in version.h is set to this value to
- -# provide backward compatibility of software written against the Gromacs API.
- -set(API_VERSION ${NUM_VERSION})
+ +# Set up common version variables, as well as general information about
+ +# the build tree (whether the build is from a source package or from a git
+ +# repository).  Also declares a few functions that will be used for generating
+ +# version info files later.
+ +include(gmxVersionInfo)
   
   if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
       set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
   endif()
   
   include(gmxBuildTypeReference)
+ +include(gmxBuildTypeProfile)
   include(gmxBuildTypeTSAN)
   include(gmxBuildTypeASAN)
   include(gmxBuildTypeReleaseWithAssert)
   
   if(NOT CMAKE_BUILD_TYPE)
- -    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert." FORCE)
+ +    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert Profile." FORCE)
       # There's no need to offer a user the choice of ThreadSanitizer
       # Set the possible values of build type for cmake-gui
       set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
- -        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert")
+ +        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert" "Profile")
   endif()
   if(CMAKE_CONFIGURATION_TYPES)
       # Add appropriate GROMACS-specific build types for the Visual
@@@ -81,10 -128,34 +81,10 @@@
           "List of configuration types"
           FORCE)
   endif()
- set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
- -set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
   
- -enable_language(C)
- -enable_language(CXX)
   set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
   
- -set(CPACK_PACKAGE_NAME "gromacs")
- -set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
- -set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}")
- -set(CPACK_PACKAGE_VENDOR "gromacs.org")
- -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gromacs - a toolkit for high-performance molecular simulation")
- -set(CPACK_RESOURCE_FILE_WELCOME "${CMAKE_SOURCE_DIR}/admin/InstallWelcome.txt")
- -# Its GPL/LGPL, so they do not have to agree to a license for mere usage, but some installers require this...
- -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING")
- -set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/admin/InstallInfo.txt")
- -set(CPACK_SOURCE_IGNORE_FILES "\\\\.isreposource$;\\\\.git/;\\\\.gitignore$;\\\\.gitattributes;")
- -set(CPACK_PROJECT_CONFIG_FILE "${CMAKE_SOURCE_DIR}/CPackInit.cmake")
- -# CPack source archives include only the directories we list here.
- -# This variable is a list of pairs of names of source and destination
- -# directories. Most of these are used for content GROMACS generates as
- -# part of the configuration or build.
- -set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${CMAKE_BINARY_DIR}/src/programs/completion;src/programs/completion;${CMAKE_BINARY_DIR}/docs/man/man1;docs/man/man1;${CMAKE_BINARY_DIR}/docs/man/man7;docs/man/man7;${CMAKE_BINARY_DIR}/docs/old-html/final;docs/old-html/final;${CMAKE_BINARY_DIR}/docs/install-guide/final;/")
- -set(CPACK_PACKAGE_CONTACT "gmx-users@gromacs.org")
- -set(CPACK_GMX_BUILD_HELP "${GMX_BUILD_HELP}") #Works even though GMX_BUILD_HELP is defined later because it is off by default.
- -
- -#must come after all cpack settings!
- -include(CPack)
- -
   # Set a default valgrind suppression file.
   # This unfortunately needs to duplicate information from CTest to work as
   # expected...
@@@ -94,12 -165,14 +94,12 @@@ set(MEMORYCHECK_SUPPRESSIONS_FIL
       "File that contains suppressions for the memory checker")
   include(CTest)
   
- -set(SOURCE_IS_GIT_REPOSITORY OFF)
- -set(SOURCE_IS_SOURCE_DISTRIBUTION OFF)
- -if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
- -    set(SOURCE_IS_GIT_REPOSITORY ON)
- -endif()
- -if(NOT EXISTS "${CMAKE_SOURCE_DIR}/admin/.isreposource")
- -    set(SOURCE_IS_SOURCE_DISTRIBUTION ON)
- -endif()
+ +include(gmxCPackUtilities)
+ +gmx_cpack_init()
+ +
+ +# Variables that accumulate stuff influencing the installed headers
+ +set(INSTALLED_HEADER_INCLUDE_DIRS "")
+ +set(INSTALLED_HEADER_DEFINITIONS "")
   
   ########################################################################
   # Check and warn if cache generated on a different host is being reused
@@@ -190,7 -263,7 +190,7 @@@ gmx_option_multichoice
       GMX_SIMD
       "SIMD instruction set for CPU kernels and compiler optimization"
       "${GMX_SUGGESTED_SIMD}"
- -    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference)
+ +    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
   
   gmx_option_multichoice(
       GMX_FFT_LIBRARY
@@@ -231,6 -304,9 +231,6 @@@ option(GMX_OPENMP "Enable OpenMP-based 
   
   option(GMX_USE_TNG "Use the TNG library for trajectory I/O" ON)
   
- -option(GMX_GIT_VERSION_INFO "Generate git version information" ${SOURCE_IS_GIT_REPOSITORY})
- -mark_as_advanced(GMX_GIT_VERSION_INFO)
- -
   if(UNIX)
       option(GMX_SYMLINK_OLD_BINARY_NAMES "Create symbolic links for pre-5.0 binary names" ON)
   endif()
@@@ -304,15 -380,17 +304,15 @@@ if(GMX_SIMD STREQUAL "AVX_256
   endif()
   
   
- -
- -set(PKG_CFLAGS "")
   if(GMX_DOUBLE)
       add_definitions(-DGMX_DOUBLE)
- -    set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_DOUBLE")
+ +    list(APPEND INSTALLED_HEADER_DEFINITIONS "-DGMX_DOUBLE")
       if(GMX_RELAXED_DOUBLE_PRECISION)
           add_definitions(-DGMX_RELAXED_DOUBLE_PRECISION)
       endif()
   endif()
   if(GMX_SOFTWARE_INVSQRT)
- -  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
+ +    list(APPEND INSTALLED_HEADER_DEFINITIONS "-DGMX_SOFTWARE_INVSQRT")
   endif()
   
   if(WIN32 AND NOT CYGWIN)
@@@ -477,6 -555,25 +477,6 @@@ if(WIN32 AND NOT CYGWIN
       add_definitions(-DNOMINMAX)
   endif()
   
- -# only bother with finding git and using version.h if the source is a git repo
- -if(GMX_GIT_VERSION_INFO)
- -    if (NOT SOURCE_IS_GIT_REPOSITORY)
- -        message(FATAL_ERROR
- -            "Cannot generate git version information from source tree not under git. "
- -            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
- -    endif()
- -    # We need at least git v1.5.3 be able to parse git's date output. If not
- -    # found or the version is too small, we can't generate version information.
- -    find_package(Git)
- -
- -    if(NOT GIT_FOUND OR GIT_VERSION_STRING VERSION_LESS "1.5.3")
- -        message(FATAL_ERROR
- -            "No compatible git version found (>= 1.5.3 required). "
- -            "Won't be able to generate development version information. "
- -            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
- -    endif()
- -endif()
- -
   # Detect boost unless GMX_EXTERNAL_BOOST is explicitly OFF
   # Used for default if GMX_EXTERNAL_BOOST is not defined (first CMake pass)
   if(NOT DEFINED GMX_EXTERNAL_BOOST OR GMX_EXTERNAL_BOOST)
@@@ -519,6 -616,12 +519,6 @@@ if(GMX_USE_TNG AND NOT GMX_EXTERNAL_TNG
       gmx_test_zlib(HAVE_ZLIB)
   endif()
   
- -########################################################################
- -# Generate development version info for cache
- -########################################################################
- -# set(GEN_VERSION_INFO_INTERNAL "ON")
- -# include(gmxGenerateVersionString)
- -
   ########################################################################
   # Our own GROMACS tests
   ########################################################################
@@@ -528,6 -631,10 +528,6 @@@ include_directories(BEFORE ${CMAKE_SOUR
   include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/include)
   # Required for config.h, maybe should only be set in src/CMakeLists.txt
   include_directories(BEFORE ${CMAKE_BINARY_DIR}/src)
- -# Required for gmx_header_config_gen.h to be found before installation
- -include_directories(BEFORE ${CMAKE_BINARY_DIR}/src/gromacs/utility)
- -# Required for now to make old code compile
- -include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders)
   
   include(gmxTestInlineASM)
   gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
@@@ -682,18 -789,14 +682,18 @@@ if(GMX_FAHCORE
     include_directories(${COREWRAP_INCLUDE_DIR})
   endif()
   
- -option(GMX_BUILD_HELP "Build man pages, HTML help, and completions automatically (requires that compiled binaries can be executed on the build host)" OFF)
+ +# Value of AUTO tries to generate things, but will only produce warnings if
+ +# that fails.
+ +set(build_help_default AUTO)
+ +if (SOURCE_IS_SOURCE_DISTRIBUTION)
+ +    set(build_help_default OFF)
+ +endif()
+ +gmx_option_trivalue(GMX_BUILD_HELP "Build man pages, HTML help, and completions automatically (requires that compiled binaries can be executed on the build host)" ${build_help_default})
   mark_as_advanced(GMX_BUILD_HELP)
- -if (GMX_BUILD_HELP AND SOURCE_IS_SOURCE_DISTRIBUTION AND
- -    "${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
- -
+ +if (GMX_BUILD_HELP AND SOURCE_IS_SOURCE_DISTRIBUTION AND BUILD_IS_INSOURCE)
       message(FATAL_ERROR
- -        "Rebuilding HTML and man pages is not supported for in-source "
- -        "builds from a source distribution. "
+ +        "Rebuilding HTML and man pages or shell completions is not supported "
+ +        "for in-source builds from a source distribution. "
           "Set GMX_BUILD_HELP=OFF or do an out-of-source build to proceed.")
   endif()
   
@@@ -725,6 -828,7 +725,6 @@@ else(
       set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} ${OpenMP_LINKER_FLAGS})
       set(GMX_SHARED_LINKER_FLAGS ${GMX_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS})
   endif()
- -set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
   
   ########################################################################
   # Specify install locations
@@@ -741,24 -845,17 +741,24 @@@ mark_as_advanced(GMX_LIB_INSTALL_DIR GM
   
   # These variables are used internally to provide a central location for
   # customizing the install locations.
- -set(LIB_INSTALL_DIR  ${GMX_LIB_INSTALL_DIR})
- -set(BIN_INSTALL_DIR  bin)
- -set(DATA_INSTALL_DIR share/${GMX_DATA_INSTALL_DIR})
- -set(MAN_INSTALL_DIR  share/man)
- -set(INCL_INSTALL_DIR include)
+ +set(LIB_INSTALL_DIR       ${GMX_LIB_INSTALL_DIR})
+ +set(BIN_INSTALL_DIR       bin)
+ +set(DATA_INSTALL_DIR      share/${GMX_DATA_INSTALL_DIR})
+ +set(MAN_INSTALL_DIR       share/man)
+ +# If the nesting level wrt. the installation root is changed,
+ +# gromacs-config.cmake.cmakein needs to be adapted.
+ +set(CMAKE_INSTALL_DIR     share/cmake)
+ +# TODO: Make GMXRC adapt if this is changed
+ +set(PKGCONFIG_INSTALL_DIR ${LIB_INSTALL_DIR}/pkgconfig)
+ +set(INCL_INSTALL_DIR      include)
   
   # These variables get written into config.h for use in finding the data
   # directories.
   set(GMXLIB_SEARCH_DIR share/${GMX_DATA_INSTALL_DIR}/top)
   set(GMXLIB_FALLBACK   ${CMAKE_INSTALL_PREFIX}/${DATA_INSTALL_DIR}/top)
   
+ +list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR})
+ +
   # Binary and library suffix options
   include(gmxManageSuffixes)
   
@@@ -804,14 -901,15 +804,14 @@@ install(FILES COPYING DESTINATION ${DAT
   
   if(GMX_EXTERNAL_BOOST)
       include_directories(${Boost_INCLUDE_DIRS})
- -    set(PKG_CFLAGS "${PKG_CFLAGS} -I${Boost_INCLUDE_DIRS}")
+ +    list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${Boost_INCLUDE_DIRS})
   else()
       include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/boost)
+ +    list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR}/gromacs/external/boost)
+ +    list(APPEND INSTALLED_HEADED_DEFINITIONS "-DBOOST_NO_TYPEID")
       # typeid not supported for minimal internal version
       # (would add significant amount of code)
       add_definitions(-DBOOST_NO_TYPEID)
- -    # TODO: Propagate the above settings to the installed CMakeFiles.txt template
- -    # (from share/template/)
- -    set(PKG_CFLAGS "${PKG_CFLAGS} -DBOOST_NO_TYPEID -I${CMAKE_INSTALL_PREFIX}/${INCL_INSTALL_DIR}/gromacs/external/boost")
       if (NOT GMX_BUILD_MDRUN_ONLY)
           install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/external/boost/boost
                   DESTINATION ${INCL_INSTALL_DIR}/gromacs/external/boost
@@@ -854,8 -952,6 +854,8 @@@ if (BUILD_TESTING
       add_subdirectory(tests)
   endif()
   
+ +gmx_cpack_write_config()
+ +
   # Issue a warning if NVIDIA GPUs were detected, but CUDA was not found.
   # Don't bother the user after the first configure pass.
   if ((CUDA_NOTFOUND_AUTO AND GMX_DETECT_GPU_AVAILABLE) AND NOT GMX_GPU_DETECTION_DONE)
diff --combined cmake/gmxCFlags.cmake

index 011cada36e16dfdbe6eb9008041156c85e4c070d,124107b65c41c8105f528ebb91ae1c6e3a31cb09..3a0e3bbe15428b144c8fc1510112d46ee442b4a3
--- 1/cmake/gmxCFlags.cmake
--- 2/cmake/gmxCFlags.cmake
+++ b/cmake/gmxCFlags.cmake
@@@ -64,7 -64,7 +64,7 @@@ function(gmx_set_cmake_compiler_flags
           # be set up elsewhere and passed to this function, but it is
           # inconvenient in CMake to pass more than one list, and such a
           # list is only used here.
-         foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
- -        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL)
++        foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
               set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}")
           endforeach()
           # Copy the flags that are only used by the real Release build
@@@ -85,9 -85,12 +85,12 @@@
               endif()
   
               # Append to the variables for the given build type for
-             # each language, in the parent scope.
+             # each language, in the parent scope. We add our new variables at the end, so
+             # compiler-specific choices are more likely to override default CMake choices.
+             # This is for instance useful for RelWithDebInfo builds, where we want to use the full
+             # set of our optimization flags detected in this file, rather than having -O2 override them.
               set(CMAKE_${language}_FLAGS${punctuation}${build_type}
-                 "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}"
+                 "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}"
                   PARENT_SCOPE)
           endforeach()
       endforeach()
@@@ -110,12 -113,14 +113,12 @@@ MACRO(gmx_c_flags
           # Since 4.8 on by default. For previous version disabling is a no-op. Only disabling for Release because with assert
           # the warnings are OK.
           GMX_TEST_CFLAG(CFLAGS_WARN_REL "-Wno-array-bounds" GMXC_CFLAGS_RELEASE_ONLY)
- -        # Since gcc 4.8 strict - false postives with old gmx_fatal. TODO: Remove in master
- -        GMX_TEST_CFLAG(CFLAGS_WARN_UNINIT "-Wno-maybe-uninitialized" GMXC_CFLAGS)
           if(CYGWIN)
               GMX_TEST_CFLAG(CFLAGS_WARN_SUBSCRIPT "-Wno-char-subscripts" GMXC_CFLAGS)
           endif()
           # new in gcc 4.5
           GMX_TEST_CFLAG(CFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CFLAGS_RELEASE)
- -        GMX_TEST_CFLAG(CFLAGS_COPT "-fomit-frame-pointer -funroll-all-loops"
+ +        GMX_TEST_CFLAG(CFLAGS_COPT "-funroll-all-loops"
                          GMXC_CFLAGS_RELEASE)
           GMX_TEST_CFLAG(CFLAGS_NOINLINE "-fno-inline" GMXC_CFLAGS_DEBUG)
       endif()
@@@ -131,7 -136,7 +134,7 @@@
           GMX_TEST_CFLAG(CXXFLAGS_WARN_REL "-Wno-array-bounds" GMXC_CXXFLAGS_RELEASE_ONLY)
           # new in gcc 4.5
           GMX_TEST_CXXFLAG(CXXFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CXXFLAGS_RELEASE)
- -        GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-fomit-frame-pointer -funroll-all-loops"
+ +        GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-funroll-all-loops"
                            GMXC_CXXFLAGS_RELEASE)
           GMX_TEST_CXXFLAG(CXXFLAGS_NOINLINE "-fno-inline" GMXC_CXXFLAGS_DEBUG)
       endif()
@@@ -148,7 -153,9 +151,7 @@@
                       GMX_TEST_CFLAG(CFLAGS_PRAGMA "-wd161" GMXC_CFLAGS)
                   endif()
               endif()
- -# 111: statement is unreachable
- -# 177: function ".." was declared but never referenced
- -# 181: argument of type ".." is incompatible with format "..", expecting argument of type ".."
+ +# 177: function/variable ".." was declared but never referenced
   # 193: zero used for undefined preprocessing identifier ".."
   # 271: trailing comma is nonstandard
   # 304: access control not specified ("public" by default)
@@@ -171,7 -178,7 +174,7 @@@
   #3346: dynamic exception specifications are deprecated
   #11074: Inlining inhibited by limit max-size(/max-total-size)
   #11076: To get full report use -opt-report=3 -opt-report-phase ipo (shown for previous remark)
- -            GMX_TEST_CFLAG(CFLAGS_WARN "-w3 -wd111 -wd177 -wd181 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076" GMXC_CFLAGS)
+ +            GMX_TEST_CFLAG(CFLAGS_WARN "-w3 -wd177 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076" GMXC_CFLAGS)
               GMX_TEST_CFLAG(CFLAGS_STDGNU "-std=gnu99" GMXC_CFLAGS)
               GMX_TEST_CFLAG(CFLAGS_OPT "-ip -funroll-all-loops -alias-const -ansi-alias" GMXC_CFLAGS_RELEASE)
           else()
@@@ -182,7 -189,7 +185,7 @@@
                       GMX_TEST_CFLAG(CFLAGS_PRAGMA "/wd161" GMXC_CFLAGS)
                   endif()
               endif()
- -            GMX_TEST_CFLAG(CFLAGS_WARN "/W3 /wd111 /wd177 /wd181 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346" GMXC_CFLAGS)
+ +            GMX_TEST_CFLAG(CFLAGS_WARN "/W3 /wd177 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346" GMXC_CFLAGS)
               GMX_TEST_CFLAG(CFLAGS_OPT "/Qip" GMXC_CFLAGS_RELEASE)
           endif()
       endif()
@@@ -199,7 -206,7 +202,7 @@@
   #All but the following warnings are identical for the C-compiler (see above)
   #1782: #pragma once is obsolete
   #2282: unrecognized GCC pragma
- -            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-w3 -wd111 -wd177 -wd181 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076 -wd1782 -wd2282" GMXC_CXXFLAGS)
+ +            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-w3 -wd177 -wd193 -wd271 -wd304 -wd383 -wd424 -wd444 -wd522 -wd593 -wd869 -wd981 -wd1418 -wd1419 -wd1572 -wd1599 -wd2259 -wd2415 -wd2547 -wd2557 -wd3280 -wd3346 -wd11074 -wd11076 -wd1782 -wd2282" GMXC_CXXFLAGS)
               GMX_TEST_CXXFLAG(CXXFLAGS_OPT "-ip -funroll-all-loops -alias-const -ansi-alias" GMXC_CXXFLAGS_RELEASE)
           else()
               if(NOT GMX_OPENMP)
@@@ -209,7 -216,7 +212,7 @@@
                       GMX_TEST_CXXFLAG(CXXFLAGS_PRAGMA "/wd161" GMXC_CXXFLAGS)
                   endif()
               endif()
- -            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W3 /wd111 /wd177 /wd181 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346 /wd1782 /wd2282" GMXC_CXXFLAGS)
+ +            GMX_TEST_CXXFLAG(CXXFLAGS_WARN "/W3 /wd177 /wd193 /wd271 /wd304 /wd383 /wd424 /wd444 /wd522 /wd593 /wd869 /wd981 /wd1418 /wd1419 /wd1572 /wd1599 /wd2259 /wd2415 /wd2547 /wd2557 /wd3280 /wd3346 /wd1782 /wd2282" GMXC_CXXFLAGS)
               GMX_TEST_CXXFLAG(CXXFLAGS_OPT "/Qip" GMXC_CXXFLAGS_RELEASE)
           endif()
       endif()
diff --combined src/gromacs/gmxana/gmx_tune_pme.c

index b9c2af3e717863e654c243f8cd299c582023a742,a4378039327b2f23e842dc8ec5bb3c1ce79ae3cc..c4a7e6e021b4769d6c1e6063fc8d48f38eb8c13f
--- 1/src/gromacs/gmxana/gmx_tune_pme.c
--- 2/src/gromacs/gmxana/gmx_tune_pme.c
+++ b/src/gromacs/gmxana/gmx_tune_pme.c
@@@ -32,36 -32,35 +32,36 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "config.h"
   
+ +#include <stdlib.h>
   #include <time.h>
+ +
   #ifdef HAVE_SYS_TIME_H
   #include <sys/time.h>
   #endif
   
   #include "gromacs/commandline/pargs.h"
- -#include "typedefs.h"
- -#include "types/commrec.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "vec.h"
- -#include "copyrite.h"
   #include "gromacs/fileio/tpxio.h"
- -#include "gromacs/utility/cstringutil.h"
- -#include "readinp.h"
- -#include "calcgrid.h"
- -#include "checkpoint.h"
- -#include "macros.h"
- -#include "gmx_ana.h"
- -#include "names.h"
- -#include "perf_est.h"
- -#include "inputrec.h"
- -#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/gmxana/gmx_ana.h"
+ +#include "gromacs/legacyheaders/calcgrid.h"
+ +#include "gromacs/legacyheaders/checkpoint.h"
+ +#include "gromacs/legacyheaders/inputrec.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/perf_est.h"
+ +#include "gromacs/legacyheaders/readinp.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
   #include "gromacs/math/utilities.h"
- -
- -#include "gmx_fatal.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/utility/baseversion.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
   /* Enum for situations that can occur during log file parsing, the
    * corresponding string entries can be found in do_the_tests() in
@@@ -144,6 -143,29 +144,6 @@@ static void cleandata(t_perf *perfdata
   }
   
   
- -static gmx_bool is_equal(real a, real b)
- -{
- -    real diff, eps = 1.0e-7;
- -
- -
- -    diff = a - b;
- -
- -    if (diff < 0.0)
- -    {
- -        diff = -diff;
- -    }
- -
- -    if (diff < eps)
- -    {
- -        return TRUE;
- -    }
- -    else
- -    {
- -        return FALSE;
- -    }
- -}
- -
- -
   static void remove_if_exists(const char *fn)
   {
       if (gmx_fexist(fn))
@@@ -531,8 -553,8 +531,8 @@@ static gmx_bool analyze_data
       fprintf(fp, "\n");
   
       /* Only mention settings if they were modified: */
- -    bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
- -    bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
+ +    bRefinedCoul = !gmx_within_tol(info->rcoulomb[k_win], info->rcoulomb[0], GMX_REAL_EPS);
+ +    bRefinedVdW  = !gmx_within_tol(info->rvdw[k_win], info->rvdw[0], GMX_REAL_EPS);
       bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
                        info->nky[k_win] == info->nky[0] &&
                        info->nkz[k_win] == info->nkz[0]);
@@@ -594,25 -616,25 +594,25 @@@ static void get_program_paths(gmx_bool 
       {
           if ( (cp = getenv("MPIRUN")) != NULL)
           {
- -            *cmd_mpirun = strdup(cp);
+ +            *cmd_mpirun = gmx_strdup(cp);
           }
           else
           {
- -            *cmd_mpirun = strdup(def_mpirun);
+ +            *cmd_mpirun = gmx_strdup(def_mpirun);
           }
       }
       else
       {
- -        *cmd_mpirun = strdup(empty_mpirun);
+ +        *cmd_mpirun = gmx_strdup(empty_mpirun);
       }
   
       if ( (cp = getenv("MDRUN" )) != NULL)
       {
- -        *cmd_mdrun  = strdup(cp);
+ +        *cmd_mdrun  = gmx_strdup(cp);
       }
       else
       {
- -        *cmd_mdrun  = strdup(def_mdrun);
+ +        *cmd_mdrun  = gmx_strdup(def_mdrun);
       }
   }
   
@@@ -964,11 -986,11 +964,11 @@@ static void make_benchmark_tprs
           {
               /* Determine which Coulomb radii rc to use in the benchmarks */
               add = (rmax-rmin)/(*ntprs-1);
- -            if (is_equal(rmin, info->rcoulomb[0]))
+ +            if (gmx_within_tol(rmin, info->rcoulomb[0], GMX_REAL_EPS))
               {
                   ir->rcoulomb = rmin + j*add;
               }
- -            else if (is_equal(rmax, info->rcoulomb[0]))
+ +            else if (gmx_within_tol(rmax, info->rcoulomb[0], GMX_REAL_EPS))
               {
                   ir->rcoulomb = rmin + (j-1)*add;
               }
@@@ -1068,8 -1090,8 +1068,8 @@@
           fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
   
           /* Make it clear to the user that some additional settings were modified */
- -        if (!is_equal(ir->rvdw, info->rvdw[0])
- -            || !is_equal(ir->rlistlong, info->rlistlong[0]) )
+ +        if (!gmx_within_tol(ir->rvdw, info->rvdw[0], GMX_REAL_EPS)
+ +            || !gmx_within_tol(ir->rlistlong, info->rlistlong[0], GMX_REAL_EPS) )
           {
               bNote = TRUE;
           }
@@@ -1308,9 -1330,12 +1308,12 @@@ static void make_sure_it_runs(char *mdr
       {
           /* To prevent confusion, do not again issue a gmx_fatal here since we already
            * get the error message from mdrun itself */
-         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
+         sprintf(msg,
+                 "Cannot run the first benchmark simulation! Please check the error message of\n"
                   "mdrun for the source of the problem. Did you provide a command line\n"
-                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+                 "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
+                 "sure your command line should work, you can bypass this check with \n"
+                 "gmx tune_pme -nocheck. The failing command was:\n"
                   "\n%s\n\n", command);
   
           fprintf(stderr, "%s", msg);
@@@ -1341,7 -1366,7 +1344,7 @@@ static void do_the_tests
           int             npme_fixed,     /* If >= -1, test fixed number of PME
                                            * nodes only                             */
           const char     *npmevalues_opt, /* Which -npme values should be tested    */
-         t_perf        **perfdata,       /* Here the performace data is stored     */
+         t_perf        **perfdata,       /* Here the performance data is stored    */
           int            *pmeentries,     /* Entries in the nPMEnodes list          */
           int             repeats,        /* Repeat each test this often            */
           int             nnodes,         /* Total number of nodes = nPP + nPME     */
@@@ -1354,7 -1379,8 +1357,8 @@@
           const t_filenm *fnm,            /* List of filenames from command line    */
           int             nfile,          /* Number of files specified on the cmdl. */
           int             presteps,       /* DLB equilibration steps, is checked    */
-         gmx_int64_t     cpt_steps)      /* Time step counter in the checkpoint    */
+         gmx_int64_t     cpt_steps,      /* Time step counter in the checkpoint    */
+         gmx_bool        bCheck)         /* Check whether benchmark mdrun works    */
   {
       int      i, nr, k, ret, count = 0, totaltests;
       int     *nPMEnodes = NULL;
@@@ -1454,8 -1480,11 +1458,11 @@@
                           cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
   
                   /* To prevent that all benchmarks fail due to a show-stopper argument
-                  * on the mdrun command line, we make a quick check first */
-                 if (bFirst)
+                  * on the mdrun command line, we make a quick check first.
+                  * This check can be turned off in cases where the automatically chosen
+                  * number of PME-only ranks leads to a number of PP ranks for which no
+                  * decomposition can be found (e.g. for large prime numbers) */
+                 if (bFirst && bCheck)
                   {
                       make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
                   }
@@@ -1631,13 -1660,13 +1638,13 @@@ static void check_input
       /* Add test scenarios if rmin or rmax were set */
       if (*ntprs <= 2)
       {
- -        if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
+ +        if (!gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1) )
           {
               (*ntprs)++;
               fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
                       *rmin, *ntprs);
           }
- -        if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
+ +        if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1) )
           {
               (*ntprs)++;
               fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
@@@ -1646,13 -1675,13 +1653,13 @@@
       }
       old = *ntprs;
       /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
- -    if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
+ +    if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) || !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) )
       {
           *ntprs = max(*ntprs, 2);
       }
   
       /* If both rmin, rmax are set, we need 3 tpr files at minimum */
- -    if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
+ +    if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) )
       {
           *ntprs = max(*ntprs, 3);
       }
@@@ -1664,7 -1693,7 +1671,7 @@@
   
       if (*ntprs > 1)
       {
- -        if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
+ +        if (gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && gmx_within_tol(rcoulomb, *rmax, GMX_REAL_EPS)) /* We have just a single rc */
           {
               fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
                       "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
@@@ -2016,6 -2045,9 +2023,9 @@@ int gmx_tune_pme(int argc, char *argv[]
           "need to provide a machine- or hostfile. This can also be passed",
           "via the MPIRUN variable, e.g.[PAR]",
           "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+         "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
+         "check whether mdrun works as expected with the provided parallel settings",
+         "if the [TT]-check[tt] option is activated (the default).",
           "Please call [THISMODULE] with the normal options you would pass to",
           "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
           "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
@@@ -2025,7 -2057,7 +2035,7 @@@
           "written with enlarged cutoffs and smaller Fourier grids respectively.",
           "Typically, the first test (number 0) will be with the settings from the input",
           "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
-         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
+         "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
           "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
           "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
           "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
@@@ -2035,7 -2067,7 +2045,7 @@@
           "MD systems. The dynamic load balancing needs about 100 time steps",
           "to adapt to local load imbalances, therefore the time step counters",
           "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
-         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+         "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
           "From the 'DD' load imbalance entries in the md.log output file you",
           "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
           "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
@@@ -2092,9 -2124,9 +2102,9 @@@
           /* g_tune_pme */
           { efOUT, "-p",      "perf",     ffWRITE },
           { efLOG, "-err",    "bencherr", ffWRITE },
- -        { efTPX, "-so",     "tuned",    ffWRITE },
+ +        { efTPR, "-so",     "tuned",    ffWRITE },
           /* mdrun: */
- -        { efTPX, NULL,      NULL,       ffREAD },
+ +        { efTPR, NULL,      NULL,       ffREAD },
           { efTRN, "-o",      NULL,       ffWRITE },
           { efCOMPRESSED, "-x", NULL,     ffOPTWR },
           { efCPT, "-cpi",    NULL,       ffOPTRD },
@@@ -2162,6 -2194,7 +2172,7 @@@
       gmx_bool     bKeepAndNumCPT        = FALSE;
       gmx_bool     bResetCountersHalfWay = FALSE;
       gmx_bool     bBenchmark            = TRUE;
+     gmx_bool     bCheck                = TRUE;
   
       output_env_t oenv = NULL;
   
@@@ -2205,6 -2238,8 +2216,8 @@@
             "Launch the real simulation after optimization" },
           { "-bench",    FALSE, etBOOL, {&bBenchmark},
             "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+         { "-check",    FALSE, etBOOL, {&bCheck},
+           "Before the benchmark runs, check whether mdrun works in parallel" },
           /******************/
           /* mdrun options: */
           /******************/
@@@ -2364,8 -2399,7 +2377,8 @@@
       sep_line(fp);
       fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
       sep_line(fp);
- -    fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
+ +    fprintf(fp, "%s for Gromacs %s\n", output_env_get_program_display_name(oenv),
+ +            gmx_version());
       if (!bThreads)
       {
           fprintf(fp, "Number of ranks         : %d\n", nnodes);
@@@ -2457,7 -2491,7 +2470,7 @@@
       {
           do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
                        repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
-                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck);
   
           fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
   
diff --combined src/gromacs/gmxlib/calcgrid.c

index 1de1f57efed3998ac8ce36a36d0b67c833dee457,81856d489c877e5d0e97fad3423a90fecc815b77..7bbee76f1260ad0a81493ba9ee3ed1e0f5bf7eb9
--- 1/src/gromacs/gmxlib/calcgrid.c
--- 2/src/gromacs/gmxlib/calcgrid.c
+++ b/src/gromacs/gmxlib/calcgrid.c
@@@ -34,15 -34,16 +34,15 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "gromacs/legacyheaders/calcgrid.h"
   
   #include <math.h>
   
- -#include "typedefs.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/utility/fatalerror.h"
   #include "gromacs/utility/smalloc.h"
- -#include "gmx_fatal.h"
- -#include "calcgrid.h"
   
   /* The grid sizes below are based on timing of a 3D cubic grid in fftw
    * compiled with SSE using 4 threads in fft5d.c.
@@@ -136,7 -137,7 +136,7 @@@ real calc_grid(FILE *fp, matrix box, re
                   /* Determine how many pre-factors of 2 we need */
                   fac2 = 1;
                   i    = g_baseNR - 1;
-                 while (fac2*grid_base[i-1] < nmin)
+                 while (fac2*grid_base[i] < nmin)
                   {
                       fac2 *= 2;
                   }
diff --combined src/gromacs/gmxpreprocess/toppush.c

index 161940b72e0f2a73267baeab449016d12247b67d,fab21dda5189c7318f28a6361e689b98f3503fae..8272b7787ecf3f2dad111040e9d2ca76dca6b884
--- 1/src/gromacs/gmxpreprocess/toppush.c
--- 2/src/gromacs/gmxpreprocess/toppush.c
+++ b/src/gromacs/gmxpreprocess/toppush.c
@@@ -34,27 -34,28 +34,27 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
+ +#include "toppush.h"
+ +
+ +#include <assert.h>
   #include <ctype.h>
   #include <math.h>
- -#include <assert.h>
+ +#include <stdlib.h>
   
- -#include "sysstuff.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "macros.h"
+ +#include "gromacs/gmxpreprocess/gpp_atomtype.h"
+ +#include "gromacs/gmxpreprocess/gpp_bond_atomtype.h"
+ +#include "gromacs/gmxpreprocess/readir.h"
+ +#include "gromacs/gmxpreprocess/topdirs.h"
+ +#include "gromacs/gmxpreprocess/toputil.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/warninp.h"
+ +#include "gromacs/topology/symtab.h"
   #include "gromacs/utility/cstringutil.h"
- -#include "names.h"
- -#include "toputil.h"
- -#include "toppush.h"
- -#include "topdirs.h"
- -#include "readir.h"
- -#include "symtab.h"
- -#include "gmx_fatal.h"
- -#include "warninp.h"
- -#include "gpp_atomtype.h"
- -#include "gpp_bond_atomtype.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
   void generate_nbparams(int comb, int ftype, t_params *plist, gpp_atomtype_t atype,
                          warninp_t wi)
@@@ -599,9 -600,11 +599,11 @@@ static void push_bondtype(t_params     
                       {
                           sprintf(errbuf, "Overriding %s parameters.%s",
                                   interaction_function[ftype].longname,
-                                 (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : "");
+                                 (ftype == F_PDIHS) ?
+                                 "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other."
+                                 : "");
                           warning(wi, errbuf);
-                         fprintf(stderr, "  old:");
+                         fprintf(stderr, "  old:                                         ");
                           for (j = 0; (j < nrfp); j++)
                           {
                               fprintf(stderr, " %g", bt->param[i].c[j]);
diff --combined src/gromacs/legacyheaders/domdec.h

index 13b55711e8c385ee03d41139beb3563cc684c4cb,240a7901ccd0592922e98459d37437374285905a..c44c7bf8406784fdd4ee17326d357a3266c521d4
--- 1/src/gromacs/legacyheaders/domdec.h
--- 2/src/gromacs/legacyheaders/domdec.h
+++ b/src/gromacs/legacyheaders/domdec.h
@@@ -36,10 -36,9 +36,10 @@@
   #ifndef _domdec_h
   #define _domdec_h
   
- -#include "typedefs.h"
- -#include "vsite.h"
- -#include "genborn.h"
+ +#include "gromacs/legacyheaders/genborn.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/vsite.h"
+ +#include "gromacs/timing/wallcycle.h"
   
   #ifdef __cplusplus
   extern "C" {
@@@ -139,6 -138,12 +139,12 @@@ void change_dd_dlb_cutoff_limit(t_commr
    * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
    */
   
+ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+ /* Return if the DLB lock is set */
+ 
+ void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+ /* Set a lock such that with DLB=auto DLB can (not) get turned on */
+ 
   void dd_setup_dlb_resource_sharing(t_commrec           *cr,
                                      const gmx_hw_info_t *hwinfo,
                                      const gmx_hw_opt_t  *hw_opt);
diff --combined src/gromacs/listed-forces/bonded.cpp

index b1ded380a424d9dcc554099fae8883af00314e12,0eb8d55791b983c0b5909786b7dffdb1c5ef462a..dfb846d9e700d1492bd6314d68537a9e21fb8860
--- 1/src/gromacs/listed-forces/bonded.cpp
--- 2/src/gromacs/gmxlib/bondfree.c
+++ b/src/gromacs/listed-forces/bonded.cpp
@@@ -34,50 -34,36 +34,50 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +/*! \internal \file
+ + *
+ + * \brief This file defines functions necessary for mdrun and tools to
+ + * compute energies and forces for bonded interactions.
+ + *
+ + * \author Mark Abraham <mark.j.abraham@gmail.com>
+ + *
+ + * \ingroup module_listed-forces
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include "bonded.h"
+ +
+ +#include "config.h"
   
- -#include <math.h>
   #include <assert.h>
- -#include "physics.h"
- -#include "vec.h"
- -#include "gromacs/math/utilities.h"
- -#include "txtdump.h"
- -#include "bondf.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "pbc.h"
- -#include "ns.h"
- -#include "macros.h"
- -#include "names.h"
- -#include "gmx_fatal.h"
- -#include "mshift.h"
- -#include "main.h"
- -#include "disre.h"
- -#include "orires.h"
- -#include "force.h"
- -#include "nonbonded.h"
- -#include "restcbt.h"
   
+ +#include <cmath>
+ +
+ +#include <algorithm>
+ +
+ +#include "gromacs/legacyheaders/disre.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/orires.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/utilities.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/simd/simd.h"
   #include "gromacs/simd/simd_math.h"
   #include "gromacs/simd/vector_operations.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
- -/* Find a better place for this? */
+ +#include "restcbt.h"
+ +
+ +/*! \brief Mysterious CMAP coefficient matrix */
   const int cmap_coeff_matrix[] = {
       1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4,
       0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
@@@ -98,9 -84,7 +98,9 @@@
   };
   
   
- -
+ +/* TODO This function should go and live in nonbonded.c where it is
+ +   really needed. Here, it only supports giving a fatal error message
+ +   with FENE_bonds */
   int glatnr(int *global_atom_index, int i)
   {
       int atnr;
@@@ -117,9 -101,6 +117,9 @@@
       return atnr;
   }
   
+ +/*! \brief Compute dx = xi - xj, modulo PBC if non-NULL
+ + *
+ + * \todo This kind of code appears in many places. Consolidate it */
   static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
   {
       if (pbc)
@@@ -148,7 -129,7 +148,7 @@@ typedef struct 
       gmx_simd_real_t bxx;
   } pbc_simd_t;
   
- -/* Set the SIMD pbc data from a normal t_pbc struct */
+ +/*! \brief Set the SIMD pbc data from a normal t_pbc struct */
   static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
   {
       rvec inv_bdiag;
@@@ -188,7 -169,7 +188,7 @@@
       }
   }
   
- -/* Correct distance vector *dx,*dy,*dz for PBC using SIMD */
+ +/*! \brief Correct distance vector *dx,*dy,*dz for PBC using SIMD */
   static gmx_inline void
   pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
               const pbc_simd_t *pbc)
@@@ -210,9 -191,10 +210,9 @@@
   
   #endif /* GMX_SIMD_HAVE_REAL */
   
- -/*
- - * Morse potential bond by Frank Everdij
+ +/*! \brief Morse potential bond
    *
- - * Three parameters needed:
+ + * By Frank Everdij. Three parameters needed:
    *
    * b0 = equilibrium distance in nm
    * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
@@@ -221,6 -203,7 +221,6 @@@
    * Note: the potential is referenced to be +cb at infinite separation
    *       and zero at the equilibrium distance!
    */
- -
   real morse_bonds(int nbonds,
                    const t_iatom forceatoms[], const t_iparams forceparams[],
                    const rvec x[], rvec f[], rvec fshift[],
@@@ -295,7 -278,6 +295,7 @@@
       return vtot;
   }
   
+ +//! \cond
   real cubic_bonds(int nbonds,
                    const t_iatom forceatoms[], const t_iparams forceparams[],
                    const rvec x[], rvec f[], rvec fshift[],
@@@ -369,7 -351,7 +369,7 @@@ real FENE_bonds(int nbonds
       const real half = 0.5;
       const real one  = 1.0;
       real       bm, kb;
- -    real       dr, dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
+ +    real       dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot;
       rvec       dx;
       int        i, m, ki, type, ai, aj;
       ivec       dt;
@@@ -769,6 -751,7 +769,6 @@@ real water_pol(int nbonds
           kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
           kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
           r_HH   = 1.0/forceparams[type0].wpol.rHH;
- -        r_OD   = 1.0/forceparams[type0].wpol.rOD;
           if (debug)
           {
               fprintf(debug, "WPOL: qS  = %10.5f aS = %5d\n", qS, aS);
@@@ -892,7 -875,7 +892,7 @@@ static real do_1_thole(const rvec xi, c
                          rvec fshift[], real afac)
   {
       rvec r12;
- -    real r12sq, r12_1, r12n, r12bar, v0, v1, fscal, ebar, fff;
+ +    real r12sq, r12_1, r12bar, v0, v1, fscal, ebar, fff;
       int  m, t;
   
       t      = pbc_rvec_sub(pbc, xi, xj, r12);                      /*  3 */
@@@ -931,10 -914,9 +931,10 @@@ real thole_pol(int nbonds
                  int gmx_unused *global_atom_index)
   {
       /* Interaction between two pairs of particles with opposite charge */
- -    int  i, type, a1, da1, a2, da2;
- -    real q1, q2, qq, a, al1, al2, afac;
- -    real V = 0;
+ +    int        i, type, a1, da1, a2, da2;
+ +    real       q1, q2, qq, a, al1, al2, afac;
+ +    real       V             = 0;
+ +    const real minusOneOnSix = -1.0/6.0;
   
       for (i = 0; (i < nbonds); )
       {
@@@ -949,7 -931,7 +949,7 @@@
           al1   = forceparams[type].thole.alpha1;
           al2   = forceparams[type].thole.alpha2;
           qq    = q1*q2;
- -        afac  = a*pow(al1*al2, -1.0/6.0);
+ +        afac  = a*pow(al1*al2, minusOneOnSix);
           V    += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac);
           V    += do_1_thole(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac);
           V    += do_1_thole(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac);
@@@ -1734,9 -1716,10 +1734,9 @@@ do_dih_fup_noshiftf(int i, int j, int k
                       rvec m, rvec n, rvec f[])
   {
       rvec f_i, f_j, f_k, f_l;
- -    rvec uvec, vvec, svec, dx_jl;
+ +    rvec uvec, vvec, svec;
       real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
       real a, b, p, q, toler;
- -    ivec jt, dt_ij, dt_kj, dt_lj;
   
       iprm  = iprod(m, m);       /*  5    */
       iprn  = iprod(n, n);       /*  5  */
@@@ -2006,9 -1989,10 +2006,9 @@@ pdihs_noener_simd(int nbonds
       const int             nfa1 = 5;
       int                   i, iu, s;
       int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
- -    real                  ddphi;
       real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
       real                  buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
- -    real                 *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+ +    real                 *cp, *phi0, *mult, *p, *q;
       gmx_simd_real_t       phi0_S, phi_S;
       gmx_simd_real_t       mx_S, my_S, mz_S;
       gmx_simd_real_t       nx_S, ny_S, nz_S;
@@@ -2029,6 -2013,8 +2029,6 @@@
       mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
       p     = buf + 3*GMX_SIMD_REAL_WIDTH;
       q     = buf + 4*GMX_SIMD_REAL_WIDTH;
- -    sf_i  = buf + 5*GMX_SIMD_REAL_WIDTH;
- -    msf_l = buf + 6*GMX_SIMD_REAL_WIDTH;
   
       set_pbc_simd(pbc, &pbc_simd);
   
@@@ -2117,6 -2103,155 +2117,152 @@@
       }
   }
   
- -    real                  ddphi;
+ /* This is mostly a copy of pdihs_noener_simd above, but with using
+  * the RB potential instead of a harmonic potential.
+  * This function can replace rbdihs() when no energy and virial are needed.
+  */
+ static void
+ rbdihs_noener_simd(int nbonds,
+                    const t_iatom forceatoms[], const t_iparams forceparams[],
+                    const rvec x[], rvec f[],
+                    const t_pbc *pbc, const t_graph gmx_unused *g,
+                    real gmx_unused lambda,
+                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                    int gmx_unused *global_atom_index)
+ {
+     const int             nfa1 = 5;
+     int                   i, iu, s, j;
+     int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
- -    real                 *parm, *phi, *p, *q, *sf_i, *msf_l;
+     real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+     real                  buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
- -    sf_i  = buf + (NR_RBDIHS + 2)*GMX_SIMD_REAL_WIDTH;
- -    msf_l = buf + (NR_RBDIHS + 3)*GMX_SIMD_REAL_WIDTH;
++    real                 *parm, *p, *q;
+ 
+     gmx_simd_real_t       phi_S;
+     gmx_simd_real_t       ddphi_S, cosfac_S;
+     gmx_simd_real_t       mx_S, my_S, mz_S;
+     gmx_simd_real_t       nx_S, ny_S, nz_S;
+     gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+     gmx_simd_real_t       parm_S, c_S;
+     gmx_simd_real_t       sin_S, cos_S;
+     gmx_simd_real_t       sf_i_S, msf_l_S;
+     pbc_simd_t            pbc_simd;
+ 
+     gmx_simd_real_t       pi_S  = gmx_simd_set1_r(M_PI);
+     gmx_simd_real_t       one_S = gmx_simd_set1_r(1.0);
+ 
+     /* Ensure SIMD register alignment */
+     dr  = gmx_simd_align_r(dr_array);
+     buf = gmx_simd_align_r(buf_array);
+ 
+     /* Extract aligned pointer for parameters and variables */
+     parm  = buf;
+     p     = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+     q     = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+ 
+     set_pbc_simd(pbc, &pbc_simd);
+ 
+     /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+     for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+     {
+         /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+          * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+          */
+         iu = i;
+         for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+         {
+             type  = forceatoms[iu];
+             ai[s] = forceatoms[iu+1];
+             aj[s] = forceatoms[iu+2];
+             ak[s] = forceatoms[iu+3];
+             al[s] = forceatoms[iu+4];
+ 
+             /* We don't need the first parameter, since that's a constant
+              * which only affects the energies, not the forces.
+              */
+             for (j = 1; j < NR_RBDIHS; j++)
+             {
+                 parm[j*GMX_SIMD_REAL_WIDTH + s] =
+                     forceparams[type].rbdihs.rbcA[j];
+             }
+ 
+             /* At the end fill the arrays with identical entries */
+             if (iu + nfa1 < nbonds)
+             {
+                 iu += nfa1;
+             }
+         }
+ 
+         /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+         dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                        dr,
+                        &phi_S,
+                        &mx_S, &my_S, &mz_S,
+                        &nx_S, &ny_S, &nz_S,
+                        &nrkj_m2_S,
+                        &nrkj_n2_S,
+                        p, q);
+ 
+         /* Change to polymer convention */
+         phi_S = gmx_simd_sub_r(phi_S, pi_S);
+ 
+         gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+ 
+         ddphi_S   = gmx_simd_setzero_r();
+         c_S       = one_S;
+         cosfac_S  = one_S;
+         for (j = 1; j < NR_RBDIHS; j++)
+         {
+             parm_S   = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+             ddphi_S  = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+             cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+             c_S      = gmx_simd_add_r(c_S, one_S);
+         }
+ 
+         /* Note that here we do not use the minus sign which is present
+          * in the normal RB code. This is corrected for through (m)sf below.
+          */
+         ddphi_S  = gmx_simd_mul_r(ddphi_S, sin_S);
+ 
+         sf_i_S   = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+         msf_l_S  = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+ 
+         /* After this m?_S will contain f[i] */
+         mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+         my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+         mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+ 
+         /* After this m?_S will contain -f[l] */
+         nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+         ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+         nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+ 
+         gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+         gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+         gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+         gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+         gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+         gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+ 
+         iu = i;
+         s  = 0;
+         do
+         {
+             do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                         p[s], q[s],
+                                         dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                         dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+                                         f);
+             s++;
+             iu += nfa1;
+         }
+         while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+     }
+ }
+ 
   #endif /* GMX_SIMD_HAVE_REAL */
   
   
@@@ -2175,7 -2310,7 +2321,7 @@@ real idihs(int nbonds
   
           dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
   
- -        do_dih_fup(ai, aj, ak, al, (real)(-ddphi), r_ij, r_kj, r_kl, m, n,
+ +        do_dih_fup(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
                      f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
           /* 218 TOTAL  */
   #ifdef DEBUG
@@@ -2265,44 -2400,6 +2411,44 @@@ static void posres_dx(const rvec x, con
       }
   }
   
+ +/*! \brief Computes forces and potential for flat-bottom cylindrical restraints.
+ + *         Returns the flat-bottom potential. */
+ +static real do_fbposres_cylinder(int fbdim, rvec fm, rvec dx, real rfb, real kk, gmx_bool bInvert)
+ +{
+ +    int     d;
+ +    real    dr, dr2, invdr, v, rfb2;
+ +
+ +    dr2  = 0.0;
+ +    rfb2 = sqr(rfb);
+ +    v    = 0.0;
+ +
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        if (d != fbdim)
+ +        {
+ +            dr2 += sqr(dx[d]);
+ +        }
+ +    }
+ +
+ +    if  (dr2 > 0.0 &&
+ +         ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
+ +         )
+ +    {
+ +        dr     = sqrt(dr2);
+ +        invdr  = 1./dr;
+ +        v      = 0.5*kk*sqr(dr - rfb);
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            if (d != fbdim)
+ +            {
+ +                fm[d] = -kk*(dr-rfb)*dx[d]*invdr; /* Force pointing to the center */
+ +            }
+ +        }
+ +    }
+ +
+ +    return v;
+ +}
+ +
   /*! \brief Adds forces of flat-bottomed positions restraints to f[]
    *         and fixes vir_diag. Returns the flat-bottomed potential. */
   real fbposres(int nbonds,
@@@ -2315,8 -2412,8 +2461,8 @@@
       int              i, ai, m, d, type, npbcdim = 0, fbdim;
       const t_iparams *pr;
       real             vtot, kk, v;
- -    real             ref = 0, dr, dr2, rpot, rfb, rfb2, fact, invdr;
- -    rvec             com_sc, rdist, pos, dx, dpdl, fm;
+ +    real             dr, dr2, rfb, rfb2, fact;
+ +    rvec             com_sc, rdist, dx, dpdl, fm;
       gmx_bool         bInvert;
   
       npbcdim = ePBC2npbcdim(ePBC);
@@@ -2377,22 -2474,19 +2523,22 @@@
                       svmul(fact, dx, fm);
                   }
                   break;
+ +            case efbposresCYLINDERX:
+ +                /* cylindrical flat-bottom posres in y-z plane. fm[XX] = 0. */
+ +                fbdim = XX;
+ +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
+ +                break;
+ +            case efbposresCYLINDERY:
+ +                /* cylindrical flat-bottom posres in x-z plane. fm[YY] = 0. */
+ +                fbdim = YY;
+ +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
+ +                break;
               case efbposresCYLINDER:
- -                /* cylidrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */
- -                dr2 = sqr(dx[XX])+sqr(dx[YY]);
- -                if  (dr2 > 0.0 &&
- -                     ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) )
- -                     )
- -                {
- -                    dr     = sqrt(dr2);
- -                    invdr  = 1./dr;
- -                    v      = 0.5*kk*sqr(dr - rfb);
- -                    fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr; /* Force pointing to the center */
- -                    fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr;
- -                }
+ +            /* equivalent to efbposresCYLINDERZ for backwards compatibility */
+ +            case efbposresCYLINDERZ:
+ +                /* cylindrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */
+ +                fbdim = ZZ;
+ +                v     = do_fbposres_cylinder(fbdim, fm, dx, rfb, kk, bInvert);
                   break;
               case efbposresX: /* fbdim=XX */
               case efbposresY: /* fbdim=YY */
@@@ -2434,11 -2528,12 +2580,11 @@@ real posres(int nbonds
               real lambda, real *dvdlambda,
               int refcoord_scaling, int ePBC, rvec comA, rvec comB)
   {
- -    int              i, ai, m, d, type, ki, npbcdim = 0;
+ +    int              i, ai, m, d, type, npbcdim = 0;
       const t_iparams *pr;
       real             L1;
       real             vtot, kk, fm;
- -    real             posA, posB, ref = 0;
- -    rvec             comA_sc, comB_sc, rdist, dpdl, pos, dx;
+ +    rvec             comA_sc, comB_sc, rdist, dpdl, dx;
       gmx_bool         bForceValid = TRUE;
   
       if ((f == NULL) || (vir_diag == NULL))    /* should both be null together! */
@@@ -2631,7 -2726,7 +2777,7 @@@ real dihres(int nbonds
       real vtot = 0;
       int  ai, aj, ak, al, i, k, type, t1, t2, t3;
       real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
- -    real phi, ddphi, ddp, ddp2, dp, sign, d2r, fc, L1;
+ +    real phi, ddphi, ddp, ddp2, dp, sign, d2r, L1;
       rvec r_ij, r_kj, r_kl, m, n;
   
       L1 = 1.0-lambda;
@@@ -2738,6 -2833,7 +2884,6 @@@ real restrangles(int nbonds
   {
       int  i, d, ai, aj, ak, type, m;
       int  t1, t2;
- -    rvec r_ij, r_kj;
       real v, vtot;
       ivec jt, dt_ij, dt_kj;
       rvec f_i, f_j, f_k;
@@@ -2865,7 -2961,7 +3011,7 @@@ real restrdihs(int nbonds
           t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
           pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
           t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
- -        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
           pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
   
           /* This function computes factors needed for restricted angle potential.
@@@ -2976,7 -3072,7 +3122,7 @@@ real cbtdihs(int nbonds
           pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
           t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
           pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
- -        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
           pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
   
           /* \brief Compute factors for CBT potential
@@@ -3151,11 -3247,7 +3297,11 @@@ real rbdihs(int nbonds
       return vtot;
   }
   
- -int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
+ +//! \endcond
+ +
+ +/*! \brief Mysterious undocumented function */
+ +static int
+ +cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
   {
       int im1, ip1, ip2;
   
@@@ -3194,16 -3286,14 +3340,16 @@@
   
   }
   
- -real cmap_dihs(int nbonds,
- -               const t_iatom forceatoms[], const t_iparams forceparams[],
- -               const gmx_cmap_t *cmap_grid,
- -               const rvec x[], rvec f[], rvec fshift[],
- -               const t_pbc *pbc, const t_graph *g,
- -               real gmx_unused lambda, real gmx_unused *dvdlambda,
- -               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
- -               int  gmx_unused *global_atom_index)
+ +/*! \brief Compute CMAP dihedral energies and forces */
+ +static real
+ +cmap_dihs(int nbonds,
+ +          const t_iatom forceatoms[], const t_iparams forceparams[],
+ +          const gmx_cmap_t *cmap_grid,
+ +          const rvec x[], rvec f[], rvec fshift[],
+ +          const t_pbc *pbc, const t_graph *g,
+ +          real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +          const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +          int  gmx_unused *global_atom_index)
   {
       int         i, j, k, n, idx;
       int         ai, aj, ak, al, am;
@@@ -3212,13 -3302,13 +3358,13 @@@
       int         t11, t21, t31, t12, t22, t32;
       int         iphi1, ip1m1, ip1p1, ip1p2;
       int         iphi2, ip2m1, ip2p1, ip2p2;
- -    int         l1, l2, l3, l4;
- -    int         pos1, pos2, pos3, pos4, tmp;
+ +    int         l1, l2, l3;
+ +    int         pos1, pos2, pos3, pos4;
   
       real        ty[4], ty1[4], ty2[4], ty12[4], tc[16], tx[16];
- -    real        phi1, psi1, cos_phi1, sin_phi1, sign1, xphi1;
- -    real        phi2, psi2, cos_phi2, sin_phi2, sign2, xphi2;
- -    real        dx, xx, tt, tu, e, df1, df2, ddf1, ddf2, ddf12, vtot;
+ +    real        phi1, cos_phi1, sin_phi1, sign1, xphi1;
+ +    real        phi2, cos_phi2, sin_phi2, sign2, xphi2;
+ +    real        dx, xx, tt, tu, e, df1, df2, vtot;
       real        ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
       real        ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
       real        fg1, hg1, fga1, hgb1, gaa1, gbb1;
@@@ -3280,7 -3370,7 +3426,7 @@@
           b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
           b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
   
- -        tmp = pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
+ +        pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
   
           ra21  = iprod(a1, a1);       /* 5 */
           rb21  = iprod(b1, b1);       /* 5 */
@@@ -3341,7 -3431,7 +3487,7 @@@
           b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
           b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
   
- -        tmp = pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
+ +        pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
   
           ra22  = iprod(a2, a2);         /* 5 */
           rb22  = iprod(b2, b2);         /* 5 */
@@@ -3406,8 -3496,8 +3552,8 @@@
           dx = 2*M_PI / cmap_grid->grid_spacing;
   
           /* Where on the grid are we */
- -        iphi1 = (int)(xphi1/dx);
- -        iphi2 = (int)(xphi2/dx);
+ +        iphi1 = static_cast<int>(xphi1/dx);
+ +        iphi2 = static_cast<int>(xphi2/dx);
   
           iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
           iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
@@@ -3472,6 -3562,9 +3618,6 @@@
           e     = 0;
           df1   = 0;
           df2   = 0;
- -        ddf1  = 0;
- -        ddf2  = 0;
- -        ddf12 = 0;
   
           for (i = 3; i >= 0; i--)
           {
@@@ -3482,11 -3575,19 +3628,11 @@@
               e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
               df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
               df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
- -            ddf1  = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
- -            ddf2  = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
           }
   
- -        ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
- -            3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
- -
           fac     = RAD2DEG/dx;
           df1     = df1   * fac;
           df2     = df2   * fac;
- -        ddf1    = ddf1  * fac * fac;
- -        ddf2    = ddf2  * fac * fac;
- -        ddf12   = ddf12 * fac * fac;
   
           /* CMAP energy */
           vtot += e;
@@@ -3593,7 -3694,7 +3739,7 @@@
   }
   
   
- -
+ +//! \cond
   /***********************************************************
    *
    *   G R O M O S  9 6   F U N C T I O N S
@@@ -3852,7 -3953,7 +3998,7 @@@ real cross_bond_angle(int nbonds
       /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
        * pp. 842-847
        */
- -    int  i, ai, aj, ak, type, m, t1, t2, t3;
+ +    int  i, ai, aj, ak, type, m, t1, t2;
       rvec r_ij, r_kj, r_ik;
       real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
       rvec f_i, f_j, f_k;
@@@ -3873,7 -3974,7 +4019,7 @@@
           /* Compute distance vectors ... */
           t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
           t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
- -        t3 = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
+ +        pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
   
           /* ... and their lengths */
           r1 = norm(r_ij);
@@@ -3931,7 -4032,7 +4077,7 @@@ static real bonded_tab(const char *type
   {
       real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF;
       int  n0, nnn;
- -    real v, f, dvdlambda;
+ +    real dvdlambda;
   
       k = (1.0 - lambda)*kA + lambda*kB;
   
@@@ -3939,7 -4040,7 +4085,7 @@@
       VFtab    = table->data;
   
       rt    = r*tabscale;
- -    n0    = rt;
+ +    n0    = static_cast<int>(rt);
       if (n0 >= table->n)
       {
           gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
@@@ -4066,7 -4167,7 +4212,7 @@@ real tab_angles(int nbonds
           if (cos_theta2 < 1)
           {
               int  m;
- -            real snt, st, sth;
+ +            real st, sth;
               real cik, cii, ckk;
               real nrkj2, nrij2;
               rvec f_i, f_j, f_k;
@@@ -4160,10 -4261,11 +4306,10 @@@ real tab_dihs(int nbonds
       return vtot;
   }
   
- -/* Return if this is a potential calculated in bondfree.c,
- - * i.e. an interaction that actually calculates a potential and
- - * works on multiple atoms (not e.g. a connection or a position restraint).
- - */
- -static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype)
+ +//! \endcond
+ +
+ +gmx_bool
+ +ftype_is_bonded_potential(int ftype)
   {
       return
           (interaction_function[ftype].flags & IF_BOND) &&
@@@ -4171,7 -4273,173 +4317,7 @@@
           (ftype < F_GB12 || ftype > F_GB14);
   }
   
- -static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
- -{
- -    int ftype;
- -    int nat1;
- -    int t;
- -    int il_nr_thread;
- -
- -    idef->nthreads = nthreads;
- -
- -    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
- -    {
- -        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
- -        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
- -    }
- -
- -    for (ftype = 0; ftype < F_NRE; ftype++)
- -    {
- -        if (ftype_is_bonded_potential(ftype))
- -        {
- -            nat1 = interaction_function[ftype].nratoms + 1;
- -
- -            for (t = 0; t <= nthreads; t++)
- -            {
- -                /* Divide the interactions equally over the threads.
- -                 * When the different types of bonded interactions
- -                 * are distributed roughly equally over the threads,
- -                 * this should lead to well localized output into
- -                 * the force buffer on each thread.
- -                 * If this is not the case, a more advanced scheme
- -                 * (not implemented yet) will do better.
- -                 */
- -                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
- -
- -                /* Ensure that distance restraint pairs with the same label
- -                 * end up on the same thread.
- -                 * This is slighlty tricky code, since the next for iteration
- -                 * may have an initial il_nr_thread lower than the final value
- -                 * in the previous iteration, but this will anyhow be increased
- -                 * to the approriate value again by this while loop.
- -                 */
- -                while (ftype == F_DISRES &&
- -                       il_nr_thread > 0 &&
- -                       il_nr_thread < idef->il[ftype].nr &&
- -                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
- -                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
- -                {
- -                    il_nr_thread += nat1;
- -                }
- -
- -                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
- -            }
- -        }
- -    }
- -}
- -
- -static unsigned
- -calc_bonded_reduction_mask(const t_idef *idef,
- -                           int shift,
- -                           int t, int nt)
- -{
- -    unsigned mask;
- -    int      ftype, nb, nat1, nb0, nb1, i, a;
- -
- -    mask = 0;
- -
- -    for (ftype = 0; ftype < F_NRE; ftype++)
- -    {
- -        if (ftype_is_bonded_potential(ftype))
- -        {
- -            nb = idef->il[ftype].nr;
- -            if (nb > 0)
- -            {
- -                nat1 = interaction_function[ftype].nratoms + 1;
- -
- -                /* Divide this interaction equally over the threads.
- -                 * This is not stored: should match division in calc_bonds.
- -                 */
- -                nb0 = idef->il_thread_division[ftype*(nt+1)+t];
- -                nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
- -
- -                for (i = nb0; i < nb1; i += nat1)
- -                {
- -                    for (a = 1; a < nat1; a++)
- -                    {
- -                        mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
- -                    }
- -                }
- -            }
- -        }
- -    }
- -
- -    return mask;
- -}
- -
- -void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
- -{
- -#define MAX_BLOCK_BITS 32
- -    int t;
- -    int ctot, c, b;
- -
- -    assert(fr->nthreads >= 1);
- -
- -    /* Divide the bonded interaction over the threads */
- -    divide_bondeds_over_threads(idef, fr->nthreads);
- -
- -    if (fr->nthreads == 1)
- -    {
- -        fr->red_nblock = 0;
- -
- -        return;
- -    }
- -
- -    /* We divide the force array in a maximum of 32 blocks.
- -     * Minimum force block reduction size is 2^6=64.
- -     */
- -    fr->red_ashift = 6;
- -    while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
- -    {
- -        fr->red_ashift++;
- -    }
- -    if (debug)
- -    {
- -        fprintf(debug, "bonded force buffer block atom shift %d bits\n",
- -                fr->red_ashift);
- -    }
- -
- -    /* Determine to which blocks each thread's bonded force calculation
- -     * contributes. Store this is a mask for each thread.
- -     */
- -#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
- -    for (t = 1; t < fr->nthreads; t++)
- -    {
- -        fr->f_t[t].red_mask =
- -            calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads);
- -    }
- -
- -    /* Determine the maximum number of blocks we need to reduce over */
- -    fr->red_nblock = 0;
- -    ctot           = 0;
- -    for (t = 0; t < fr->nthreads; t++)
- -    {
- -        c = 0;
- -        for (b = 0; b < MAX_BLOCK_BITS; b++)
- -        {
- -            if (fr->f_t[t].red_mask & (1U<<b))
- -            {
- -                fr->red_nblock = max(fr->red_nblock, b+1);
- -                c++;
- -            }
- -        }
- -        if (debug)
- -        {
- -            fprintf(debug, "thread %d flags %x count %d\n",
- -                    t, fr->f_t[t].red_mask, c);
- -        }
- -        ctot += c;
- -    }
- -    if (debug)
- -    {
- -        fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
- -                fr->red_nblock, 1<<fr->red_ashift);
- -        fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
- -                ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
- -                ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
- -    }
- -}
- -
+ +/*! \brief Zero thread-local force-output buffers */
   static void zero_thread_forces(f_thread_t *f_t, int n,
                                  int nblock, int blocksize)
   {
@@@ -4190,7 -4458,7 +4336,7 @@@
               if (f_t->red_mask && (1U<<b))
               {
                   a0 = b*blocksize;
- -                a1 = min((b+1)*blocksize, n);
+ +                a1 = std::min((b+1)*blocksize, n);
                   for (a = a0; a < a1; a++)
                   {
                       clear_rvec(f_t->f[a]);
@@@ -4219,16 -4487,15 +4365,16 @@@
       }
   }
   
+ +/*! \brief The max thread number is arbitrary, we used a fixed number
+ + * to avoid memory management.  Using more than 16 threads is probably
+ + * never useful performance wise. */
+ +#define MAX_BONDED_THREADS 256
+ +
+ +/*! \brief Reduce thread-local force buffers */
   static void reduce_thread_force_buffer(int n, rvec *f,
                                          int nthreads, f_thread_t *f_t,
                                          int nblock, int block_size)
   {
- -    /* The max thread number is arbitrary,
- -     * we used a fixed number to avoid memory management.
- -     * Using more than 16 threads is probably never useful performance wise.
- -     */
- -#define MAX_BONDED_THREADS 256
       int b;
   
       if (nthreads > MAX_BONDED_THREADS)
@@@ -4261,7 -4528,7 +4407,7 @@@
               /* Reduce force buffers for threads that contribute */
               a0 =  b   *block_size;
               a1 = (b+1)*block_size;
- -            a1 = min(a1, n);
+ +            a1 = std::min(a1, n);
               for (a = a0; a < a1; a++)
               {
                   for (fb = 0; fb < nfb; fb++)
@@@ -4273,7 -4540,6 +4419,7 @@@
       }
   }
   
+ +/*! \brief Reduce thread-local forces */
   static void reduce_thread_forces(int n, rvec *f, rvec *fshift,
                                    real *ener, gmx_grppairener_t *grpp, real *dvdl,
                                    int nthreads, f_thread_t *f_t,
@@@ -4331,11 -4597,9 +4477,11 @@@
       }
   }
   
- -static real calc_one_bond(FILE *fplog, int thread,
+ +/*! \brief Calculate one element of the list of bonded interactions
+ +    for this thread */
+ +static real calc_one_bond(int thread,
                             int ftype, const t_idef *idef,
- -                          rvec x[], rvec f[], rvec fshift[],
+ +                          const rvec x[], rvec f[], rvec fshift[],
                             t_forcerec *fr,
                             const t_pbc *pbc, const t_graph *g,
                             gmx_grppairener_t *grpp,
@@@ -4343,7 -4607,7 +4489,7 @@@
                             real *lambda, real *dvdl,
                             const t_mdatoms *md, t_fcdata *fcd,
                             gmx_bool bCalcEnerVir,
- -                          int *global_atom_index, gmx_bool bPrintSepPot)
+ +                          int *global_atom_index)
   {
       int      nat1, nbonds, efptFTYPE;
       real     v = 0;
@@@ -4372,7 -4636,7 +4518,7 @@@
           {
               v = cmap_dihs(nbn, iatoms+nb0,
                             idef->iparams, &idef->cmap_grid,
- -                          (const rvec*)x, f, fshift,
+ +                          x, f, fshift,
                             pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
                             md, fcd, global_atom_index);
           }
@@@ -4383,7 -4647,7 +4529,7 @@@
               /* No energies, shift forces, dvdl */
               angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
                                  idef->iparams,
- -                               (const rvec*)x, f,
+ +                               x, f,
                                  pbc, g, lambda[efptFTYPE], md, fcd,
                                  global_atom_index);
               v = 0;
@@@ -4400,24 -4664,53 +4546,37 @@@
   #endif
                   (nbn, idef->il[ftype].iatoms+nb0,
                   idef->iparams,
- -                (const rvec*)x, f,
+ +                x, f,
                   pbc, g, lambda[efptFTYPE], md, fcd,
                   global_atom_index);
               v = 0;
           }
+ #ifdef GMX_SIMD_HAVE_REAL
+         else if (ftype == F_RBDIHS &&
+                  !bCalcEnerVir && fr->efep == efepNO)
+         {
+             /* No energies, shift forces, dvdl */
+             rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                                idef->iparams,
+                                (const rvec*)x, f,
+                                pbc, g, lambda[efptFTYPE], md, fcd,
+                                global_atom_index);
+             v = 0;
+         }
+ #endif
           else
           {
               v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
                                                     idef->iparams,
- -                                                  (const rvec*)x, f, fshift,
+ +                                                  x, f, fshift,
                                                     pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
                                                     md, fcd, global_atom_index);
           }
- -        if (bPrintSepPot)
- -        {
- -            fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
- -                    interaction_function[ftype].longname,
- -                    nbonds, v, lambda[efptFTYPE]);
- -        }
       }
       else
       {
- -        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+ +        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, x, f, fshift,
                                   pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
- -
- -        if (bPrintSepPot)
- -        {
- -            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
- -                    interaction_function[ftype].longname,
- -                    interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]);
- -            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
- -                    interaction_function[ftype].longname,
- -                    interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]);
- -        }
       }
   
       if (thread == 0)
@@@ -4428,22 -4721,25 +4587,22 @@@
       return v;
   }
   
- -void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
+ +void calc_bonds(const gmx_multisim_t *ms,
                   const t_idef *idef,
- -                rvec x[], history_t *hist,
+ +                const rvec x[], history_t *hist,
                   rvec f[], t_forcerec *fr,
- -                const t_pbc *pbc, const t_graph *g,
+ +                const struct t_pbc *pbc, const struct t_graph *g,
                   gmx_enerdata_t *enerd, t_nrnb *nrnb,
                   real *lambda,
                   const t_mdatoms *md,
                   t_fcdata *fcd, int *global_atom_index,
- -                t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
- -                int force_flags,
- -                gmx_bool bPrintSepPot, gmx_int64_t step)
+ +                int force_flags)
   {
       gmx_bool      bCalcEnerVir;
       int           i;
- -    real          v, dvdl[efptNR], dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
+ +    real          dvdl[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
                                                           of lambda, which will be thrown away in the end*/
       const  t_pbc *pbc_null;
- -    char          buf[22];
       int           thread;
   
       assert(fr->nthreads == idef->nthreads);
@@@ -4462,6 -4758,11 +4621,6 @@@
       {
           pbc_null = NULL;
       }
- -    if (bPrintSepPot)
- -    {
- -        fprintf(fplog, "Step %s: bonded V and dVdl for this rank\n",
- -                gmx_step_str(step, buf));
- -    }
   
   #ifdef DEBUG
       if (g && debug)
@@@ -4476,14 -4777,14 +4635,14 @@@
           enerd->term[F_ORIRESDEV] =
               calc_orires_dev(ms, idef->il[F_ORIRES].nr,
                               idef->il[F_ORIRES].iatoms,
- -                            idef->iparams, md, (const rvec*)x,
+ +                            idef->iparams, md, x,
                               pbc_null, fcd, hist);
       }
       if (idef->il[F_DISRES].nr)
       {
           calc_disres_R_6(idef->il[F_DISRES].nr,
                           idef->il[F_DISRES].iatoms,
- -                        idef->iparams, (const rvec*)x, pbc_null,
+ +                        idef->iparams, x, pbc_null,
                           fcd, hist);
   #ifdef GMX_MPI
           if (fcd->disres.nsystems > 1)
@@@ -4527,11 -4828,11 +4686,11 @@@
           {
               if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
               {
- -                v = calc_one_bond(fplog, thread, ftype, idef, x,
+ +                v = calc_one_bond(thread, ftype, idef, x,
                                     ft, fshift, fr, pbc_null, g, grpp,
                                     nrnb, lambda, dvdlt,
                                     md, fcd, bCalcEnerVir,
- -                                  global_atom_index, bPrintSepPot);
+ +                                  global_atom_index);
                   epot[ftype] += v;
               }
           }
@@@ -4561,17 -4862,18 +4720,17 @@@
       }
   }
   
- -void calc_bonds_lambda(FILE *fplog,
- -                       const t_idef *idef,
- -                       rvec x[],
+ +void calc_bonds_lambda(const t_idef *idef,
+ +                       const rvec x[],
                          t_forcerec *fr,
- -                       const t_pbc *pbc, const t_graph *g,
+ +                       const struct t_pbc *pbc, const struct t_graph *g,
                          gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb,
                          real *lambda,
                          const t_mdatoms *md,
                          t_fcdata *fcd,
                          int *global_atom_index)
   {
- -    int           i, ftype, nr_nonperturbed, nr;
+ +    int           ftype, nr_nonperturbed, nr;
       real          v;
       real          dvdl_dum[efptNR];
       rvec         *f, *fshift;
@@@ -4612,11 -4914,11 +4771,11 @@@
   
               if (nr - nr_nonperturbed > 0)
               {
- -                v = calc_one_bond(fplog, 0, ftype, &idef_fe,
+ +                v = calc_one_bond(0, ftype, &idef_fe,
                                     x, f, fshift, fr, pbc_null, g,
                                     grpp, nrnb, lambda, dvdl_dum,
                                     md, fcd, TRUE,
- -                                  global_atom_index, FALSE);
+ +                                  global_atom_index);
                   epot[ftype] += v;
               }
           }
diff --combined src/gromacs/mdlib/domdec.cpp

index 13934d5fa68fd157027fac7fc3d4cb8b84662b6a,6cc6b73895d08a33eadcd83abcb492bd2dffcbef..06f2602916cdedd2b27728612b90bd533f09eadb
--- 1/src/gromacs/mdlib/domdec.cpp
--- 2/src/gromacs/mdlib/domdec.c
+++ b/src/gromacs/mdlib/domdec.cpp
@@@ -33,59 -33,53 +33,59 @@@
    * the research papers on the package. Check out http://www.gromacs.org.
    */
   
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include <stdio.h>
- -#include <time.h>
+ +#include "gromacs/legacyheaders/domdec.h"
+ +
+ +#include "config.h"
+ +
+ +#include <assert.h>
   #include <math.h>
- -#include <string.h>
+ +#include <stdio.h>
   #include <stdlib.h>
- -#include <assert.h>
+ +#include <string.h>
+ +#include <time.h>
+ +
+ +#include <algorithm>
   
- -#include "typedefs.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "gmx_fatal.h"
- -#include "gmx_fatal_collective.h"
- -#include "vec.h"
- -#include "domdec.h"
- -#include "domdec_network.h"
- -#include "nrnb.h"
- -#include "pbc.h"
- -#include "chargegroup.h"
- -#include "constr.h"
- -#include "mdatoms.h"
- -#include "names.h"
- -#include "force.h"
- -#include "pme.h"
- -#include "mdrun.h"
- -#include "nsgrid.h"
- -#include "shellfc.h"
- -#include "mtop_util.h"
- -#include "gmx_ga2la.h"
- -#include "macros.h"
- -#include "nbnxn_search.h"
- -#include "bondf.h"
- -#include "gmx_omp_nthreads.h"
- -#include "gpu_utils.h"
- -
- -#include "gromacs/fileio/futil.h"
   #include "gromacs/fileio/gmxfio.h"
   #include "gromacs/fileio/pdbio.h"
+ +#include "gromacs/imd/imd.h"
+ +#include "gromacs/legacyheaders/bonded-threading.h"
+ +#include "gromacs/legacyheaders/chargegroup.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/domdec_network.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/gmx_ga2la.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/gpu_utils.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/nsgrid.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/shellfc.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/listed-forces/bonded.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_search.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/pulling/pull.h"
+ +#include "gromacs/pulling/pull_rotation.h"
+ +#include "gromacs/swap/swapcoords.h"
   #include "gromacs/timing/wallcycle.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/utility/basenetwork.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/futil.h"
   #include "gromacs/utility/gmxmpi.h"
- -#include "gromacs/swap/swapcoords.h"
   #include "gromacs/utility/qsort_threadsafe.h"
- -#include "gromacs/pulling/pull.h"
- -#include "gromacs/pulling/pull_rotation.h"
- -#include "gromacs/imd/imd.h"
+ +#include "gromacs/utility/smalloc.h"
   
   #define DDRANK(dd, rank)    (rank)
   #define DDMASTERRANK(dd)   (dd->masterrank)
@@@ -275,6 -269,8 +275,8 @@@ typedef struct gmx_domdec_com
   
       /* The DLB option */
       int      eDLB;
+     /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+     gmx_bool bDLB_locked;
       /* Are we actually using DLB? */
       gmx_bool bDynLoadBal;
   
@@@ -391,9 -387,9 +393,9 @@@
       int    eFlop;
       double flop;
       int    flop_n;
-     /* Have often have did we have load measurements */
+     /* How many times have did we have load measurements */
       int    n_load_have;
-     /* Have often have we collected the load measurements */
+     /* How many times have we collected the load measurements */
       int    n_load_collect;
   
       /* Statistics */
@@@ -772,7 -768,7 +774,7 @@@ void dd_move_f(gmx_domdec_t *dd, rvec f
       rvec                  *buf, *sbuf;
       ivec                   vis;
       int                    is;
- -    gmx_bool               bPBC, bScrew;
+ +    gmx_bool               bShiftForcesNeedPbc, bScrew;
   
       comm = dd->comm;
   
@@@ -780,17 -776,16 +782,17 @@@
   
       buf = comm->vbuf.v;
   
- -    n       = 0;
       nzone   = comm->zones.n/2;
       nat_tot = dd->nat_tot;
       for (d = dd->ndim-1; d >= 0; d--)
       {
- -        bPBC   = (dd->ci[dd->dim[d]] == 0);
- -        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
+ +        /* Only forces in domains near the PBC boundaries need to
+ +           consider PBC in the treatment of fshift */
+ +        bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
+ +        bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
           if (fshift == NULL && !bScrew)
           {
- -            bPBC = FALSE;
+ +            bShiftForcesNeedPbc = FALSE;
           }
           /* Determine which shift vector we need */
           clear_ivec(vis);
@@@ -826,7 -821,7 +828,7 @@@
               index = ind->index;
               /* Add the received forces */
               n = 0;
- -            if (!bPBC)
+ +            if (!bShiftForcesNeedPbc)
               {
                   for (i = 0; i < ind->nsend[nzone]; i++)
                   {
@@@ -841,9 -836,6 +843,9 @@@
               }
               else if (!bScrew)
               {
+ +                /* fshift should always be defined if this function is
+ +                 * called when bShiftForcesNeedPbc is true */
+ +                assert(NULL != fshift);
                   for (i = 0; i < ind->nsend[nzone]; i++)
                   {
                       at0 = cgindex[index[i]];
@@@ -964,6 -956,7 +966,6 @@@ void dd_atom_sum_real(gmx_domdec_t *dd
   
       buf = &comm->vbuf.v[0][0];
   
- -    n       = 0;
       nzone   = comm->zones.n/2;
       nat_tot = dd->nat_tot;
       for (d = dd->ndim-1; d >= 0; d--)
@@@ -1069,7 -1062,7 +1071,7 @@@ static void dd_sendrecv_ddzone(const gm
   static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                             rvec cell_ns_x0, rvec cell_ns_x1)
   {
- -    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
+ +    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
       gmx_ddzone_t      *zp;
       gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
       gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
@@@ -1141,7 -1134,7 +1143,7 @@@
           if (bPBC)
           {
               /* Take the minimum to avoid double communication */
- -            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
+ +            npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
           }
           else
           {
@@@ -1165,9 -1158,9 +1167,9 @@@
               {
                   for (d1 = d; d1 < dd->ndim-1; d1++)
                   {
- -                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
- -                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
- -                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
+ +                    extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
+ +                    extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
+ +                    extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
                   }
               }
           }
@@@ -1228,9 -1221,9 +1230,9 @@@
                   {
                       if (bUse)
                       {
- -                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
- -                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
- -                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
+ +                        buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
+ +                        buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
+ +                        buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                       }
   
                       if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
@@@ -1243,8 -1236,8 +1245,8 @@@
                       }
                       if (bUse && dh[d1] >= 0)
                       {
- -                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
- -                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
+ +                        buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
+ +                        buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                       }
                   }
                   /* Copy the received buffer to the send buffer,
@@@ -1260,9 -1253,9 +1262,9 @@@
   
                   for (d1 = d; d1 < dd->ndim-1; d1++)
                   {
- -                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
- -                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
- -                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
+ +                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
+ +                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
+ +                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                       pos++;
                   }
   
@@@ -1292,8 -1285,8 +1294,8 @@@
               {
                   print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
               }
- -            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
- -            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
+ +            cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
+ +            cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
           }
       }
       if (dd->ndim >= 3)
@@@ -1307,8 -1300,8 +1309,8 @@@
                   {
                       print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                   }
- -                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
- -                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
+ +                cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
+ +                cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
               }
           }
       }
@@@ -1534,6 -1527,10 +1536,6 @@@ static void dd_collect_vec_gatherv(gmx_
   void dd_collect_vec(gmx_domdec_t *dd,
                       t_state *state_local, rvec *lv, rvec *v)
   {
- -    gmx_domdec_master_t *ma;
- -    int                  n, i, c, a, nalloc = 0;
- -    rvec                *buf = NULL;
- -
       dd_collect_cg(dd, state_local);
   
       if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
@@@ -1755,7 -1752,7 +1757,7 @@@ static void dd_distribute_vec_scatterv(
   {
       gmx_domdec_master_t *ma;
       int                 *scounts = NULL, *disps = NULL;
- -    int                  n, i, c, a, nalloc = 0;
+ +    int                  n, i, c, a;
       rvec                *buf = NULL;
   
       if (DDMASTER(dd))
@@@ -1949,7 -1946,7 +1951,7 @@@ static void write_dd_grid_pdb(const cha
           snew(grid_r, 2*dd->nnodes);
       }
   
- -    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
+ +    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
   
       if (DDMASTER(dd))
       {
@@@ -2092,15 -2089,15 +2094,15 @@@ real dd_cutoff_mbody(gmx_domdec_t *dd
               r = comm->cellsize_min[dd->dim[0]];
               for (di = 1; di < dd->ndim; di++)
               {
- -                r = min(r, comm->cellsize_min[dd->dim[di]]);
+ +                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
               }
               if (comm->bBondComm)
               {
- -                r = max(r, comm->cutoff_mbody);
+ +                r = std::max(r, comm->cutoff_mbody);
               }
               else
               {
- -                r = min(r, comm->cutoff);
+ +                r = std::min(r, comm->cutoff);
               }
           }
       }
@@@ -2114,7 -2111,7 +2116,7 @@@ real dd_cutoff_twobody(gmx_domdec_t *dd
   
       r_mb = dd_cutoff_mbody(dd);
   
- -    return max(dd->comm->cutoff, r_mb);
+ +    return std::max(dd->comm->cutoff, r_mb);
   }
   
   
@@@ -2176,7 -2173,7 +2178,7 @@@ static int *dd_pmenodes(t_commrec *cr
   static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
   {
       gmx_domdec_t *dd;
- -    ivec          coords, coords_pme, nc;
+ +    ivec          coords;
       int           slab;
   
       dd = cr->dd;
@@@ -2245,6 -2242,7 +2247,6 @@@ static int dd_simnode2pmenode(t_commre
   {
       gmx_domdec_t      *dd;
       gmx_domdec_comm_t *comm;
- -    ivec               coord, coord_pme;
       int                i;
       int                pmenode = -1;
   
@@@ -2255,7 -2253,6 +2257,7 @@@
       if (comm->bCartesianPP_PME)
       {
   #ifdef GMX_MPI
+ +        ivec coord, coord_pme;
           MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
           if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
           {
@@@ -2393,7 -2390,7 +2395,7 @@@ void get_pme_ddnodes(t_commrec *cr, in
   static gmx_bool receive_vir_ener(t_commrec *cr)
   {
       gmx_domdec_comm_t *comm;
- -    int                pmenode, coords[DIM], rank;
+ +    int                pmenode;
       gmx_bool           bReceive;
   
       bReceive = TRUE;
@@@ -2404,12 -2401,10 +2406,12 @@@
           {
               pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
   #ifdef GMX_MPI
+ +            ivec coords;
               MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
               coords[comm->cartpmedim]++;
               if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
               {
+ +                int rank;
                   MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                   if (dd_simnode2pmenode(cr, rank) == pmenode)
                   {
@@@ -2517,8 -2512,12 +2519,8 @@@ static void make_dd_indices(gmx_domdec_
   {
       int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
       int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
- -    gmx_ga2la_t *ga2la;
- -    char        *bLocalCG;
       gmx_bool     bCGs;
   
- -    bLocalCG = dd->comm->bLocalCG;
- -
       if (dd->nat_tot > dd->gatindex_nalloc)
       {
           dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
@@@ -2583,7 -2582,7 +2585,7 @@@
   static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                             const char *where)
   {
- -    int ncg, i, ngl, nerr;
+ +    int i, ngl, nerr;
   
       nerr = 0;
       if (bLocalCG == NULL)
@@@ -2743,12 -2742,12 +2745,12 @@@ static real cellsize_min_dlb(gmx_domdec
           /* The cut-off might have changed, e.g. by PME load balacning,
            * from the value used to set comm->cellsize_min, so check it.
            */
- -        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
+ +        cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
   
           if (comm->bPMELoadBalDLBLimits)
           {
               /* Check for the cut-off limit set by the PME load balancing */
- -            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
+ +            cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
           }
       }
   
@@@ -2771,10 -2770,10 +2773,10 @@@ static real grid_jump_limit(gmx_domdec_
       {
           if (comm->bPMELoadBalDLBLimits)
           {
- -            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
+ +            cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
           }
- -        grid_jump_limit = max(grid_jump_limit,
- -                              cutoff/comm->cd[dim_ind].np);
+ +        grid_jump_limit = std::max(grid_jump_limit,
+ +                                   cutoff/comm->cd[dim_ind].np);
       }
   
       return grid_jump_limit;
@@@ -2961,8 -2960,8 +2963,8 @@@ static void init_ddpme(gmx_domdec_t *dd
               {
                   slab = pmeindex % ddpme->nslab;
               }
- -            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
- -            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
+ +            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
+ +            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
           }
       }
   
@@@ -3163,7 -3162,7 +3165,7 @@@ static void set_dd_cell_sizes_slb(gmx_d
                   {
                       npulse[d]++;
                   }
- -                cellsize_min[d] = min(cellsize_min[d], cellsize);
+ +                cellsize_min[d] = std::min(cellsize_min[d], cellsize);
               }
               if (setmode == setcellsizeslbLOCAL)
               {
@@@ -3433,7 -3432,7 +3435,7 @@@ static void set_dd_cell_sizes_dlb_root(
                                          gmx_bool bUniform, gmx_int64_t step)
   {
       gmx_domdec_comm_t *comm;
- -    int                ncd, d1, i, j, pos;
+ +    int                ncd, d1, i, pos;
       real              *cell_size;
       real               load_aver, load_i, imbalance, change, change_max, sc;
       real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
@@@ -3465,7 -3464,7 +3467,7 @@@
               cell_size[i] = 1.0/ncd;
           }
       }
-     else if (dd_load_count(comm))
+     else if (dd_load_count(comm) > 0)
       {
           load_aver  = comm->load[d].sum_m/ncd;
           change_max = 0;
@@@ -3476,7 -3475,7 +3478,7 @@@
               imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
               /* Determine the change of the cell size using underrelaxation */
               change     = -relax*imbalance;
- -            change_max = max(change_max, max(change, -change));
+ +            change_max = std::max(change_max, std::max(change, -change));
           }
           /* Limit the amount of scaling.
            * We need to use the same rescaling for all cells in one row,
@@@ -3619,7 -3618,7 +3621,7 @@@ static void distribute_dd_cell_sizes_dl
                                            gmx_ddbox_t *ddbox)
   {
       gmx_domdec_comm_t *comm;
- -    int                d1, dim1, pos;
+ +    int                d1, pos;
   
       comm = dd->comm;
   
@@@ -3907,13 -3906,13 +3909,13 @@@ static void distribute_cg(FILE *fplog, 
   {
       gmx_domdec_master_t *ma;
       int                **tmp_ind = NULL, *tmp_nalloc = NULL;
- -    int                  i, icg, j, k, k0, k1, d, npbcdim;
+ +    int                  i, icg, j, k, k0, k1, d;
       matrix               tcm;
- -    rvec                 box_size, cg_cm;
+ +    rvec                 cg_cm;
       ivec                 ind;
       real                 nrcg, inv_ncg, pos_d;
       atom_id             *cgindex;
- -    gmx_bool             bUnbounded, bScrew;
+ +    gmx_bool             bScrew;
   
       ma = dd->ma;
   
@@@ -4121,9 -4120,9 +4123,9 @@@ static void get_cg_distribution(FILE *f
       }
   
       dd_scatterv(dd,
- -                DDMASTER(dd) ? ma->ibuf : NULL,
- -                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
- -                DDMASTER(dd) ? ma->cg : NULL,
+ +                bMaster ? ma->ibuf : NULL,
+ +                bMaster ? ma->ibuf+dd->nnodes : NULL,
+ +                bMaster ? ma->cg : NULL,
                   dd->ncg_home*sizeof(int), dd->index_gl);
   
       /* Determine the home charge group sizes */
@@@ -4336,7 -4335,7 +4338,7 @@@ static void clear_and_mark_ind(int ncg
   static void print_cg_move(FILE *fplog,
                             gmx_domdec_t *dd,
                             gmx_int64_t step, int cg, int dim, int dir,
-                           gmx_bool bHaveLimitdAndCMOld, real limitd,
+                           gmx_bool bHaveCgcmOld, real limitd,
                             rvec cm_old, rvec cm_new, real pos_d)
   {
       gmx_domdec_comm_t *comm;
@@@ -4345,19 -4344,22 +4347,22 @@@
       comm = dd->comm;
   
       fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
-     if (bHaveLimitdAndCMOld)
+     if (limitd > 0)
       {
-         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                   ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
       }
       else
       {
-         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
+         /* We don't have a limiting distance available: don't print it */
+         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                   ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
       }
       fprintf(fplog, "distance out of cell %f\n",
               dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
-     if (bHaveLimitdAndCMOld)
+     if (bHaveCgcmOld)
       {
           fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
                   cm_old[XX], cm_old[YY], cm_old[ZZ]);
@@@ -4375,19 -4377,20 +4380,20 @@@
   static void cg_move_error(FILE *fplog,
                             gmx_domdec_t *dd,
                             gmx_int64_t step, int cg, int dim, int dir,
-                           gmx_bool bHaveLimitdAndCMOld, real limitd,
+                           gmx_bool bHaveCgcmOld, real limitd,
                             rvec cm_old, rvec cm_new, real pos_d)
   {
       if (fplog)
       {
           print_cg_move(fplog, dd, step, cg, dim, dir,
-                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
       }
       print_cg_move(stderr, dd, step, cg, dim, dir,
-                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
       gmx_fatal(FARGS,
-               "A charge group moved too far between two domain decomposition steps\n"
-               "This usually means that your system is not well equilibrated");
+               "%s moved too far between two domain decomposition steps\n"
+               "This usually means that your system is not well equilibrated",
+               dd->comm->bCGs ? "A charge group" : "An atom");
   }
   
   static void rotate_state_atom(t_state *state, int a)
@@@ -4454,8 -4457,8 +4460,8 @@@ static void calc_cg_move(FILE *fplog, g
                            int *move)
   {
       int      npbcdim;
- -    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
- -    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+ +    int      cg, k, k0, k1, d, dim, d2;
+ +    int      mc, nrcg;
       int      flag;
       gmx_bool bScrew;
       ivec     dev;
@@@ -4509,7 -4512,8 +4515,8 @@@
                   {
                       if (pos_d >= limit1[d])
                       {
-                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+                         cg_move_error(fplog, dd, step, cg, d, 1,
+                                       cg_cm != state->x, limitd[d],
                                         cg_cm[cg], cm_new, pos_d);
                       }
                       dev[d] = 1;
@@@ -4535,7 -4539,8 +4542,8 @@@
                   {
                       if (pos_d < limit0[d])
                       {
-                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+                         cg_move_error(fplog, dd, step, cg, d, -1,
+                                       cg_cm != state->x, limitd[d],
                                         cg_cm[cg], cm_new, pos_d);
                       }
                       dev[d] = -1;
@@@ -4629,15 -4634,17 +4637,15 @@@ static void dd_redistribute_cg(FILE *fp
       int               *move;
       int                npbcdim;
       int                ncg[DIM*2], nat[DIM*2];
- -    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
- -    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+ +    int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
+ +    int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
       int                sbuf[2], rbuf[2];
       int                home_pos_cg, home_pos_at, buf_pos;
       int                flag;
       gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
- -    gmx_bool           bScrew;
- -    ivec               dev;
- -    real               inv_ncg, pos_d;
+ +    real               pos_d;
       matrix             tcm;
- -    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
+ +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
       atom_id           *cgindex;
       cginfo_mb_t       *cginfo_mb;
       gmx_domdec_comm_t *comm;
@@@ -4894,6 -4901,7 +4902,6 @@@
       {
           dim      = dd->dim[d];
           ncg_recv = 0;
- -        nat_recv = 0;
           nvr      = 0;
           for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
           {
@@@ -4928,6 -4936,7 +4936,6 @@@
                                comm->cgcm_state[cdd], nvs,
                                comm->vbuf.v+nvr, i);
               ncg_recv += rbuf[0];
- -            nat_recv += rbuf[1];
               nvr      += i;
           }
   
@@@ -4949,7 -4958,7 +4957,7 @@@
                   {
                       cg_move_error(fplog, dd, step, cg, dim,
                                     (flag & DD_FLAG_FW(d)) ? 1 : 0,
-                                   FALSE, 0,
+                                   fr->cutoff_scheme == ecutsGROUP, 0,
                                     comm->vbuf.v[buf_pos],
                                     comm->vbuf.v[buf_pos],
                                     comm->vbuf.v[buf_pos][dim]);
@@@ -5231,7 -5240,7 +5239,7 @@@ static void get_load_distribution(gmx_d
       gmx_domdec_comm_t *comm;
       gmx_domdec_load_t *load;
       gmx_domdec_root_t *root = NULL;
- -    int                d, dim, cid, i, pos;
+ +    int                d, dim, i, pos;
       float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
       gmx_bool           bSepPME;
   
@@@ -5327,7 -5336,7 +5335,7 @@@
                   for (i = 0; i < dd->nc[dim]; i++)
                   {
                       load->sum += load->load[pos++];
- -                    load->max  = max(load->max, load->load[pos]);
+ +                    load->max  = std::max(load->max, load->load[pos]);
                       pos++;
                       if (dd->bGridJump)
                       {
@@@ -5336,14 -5345,14 +5344,14 @@@
                               /* This direction could not be load balanced properly,
                                * therefore we need to use the maximum iso the average load.
                                */
- -                            load->sum_m = max(load->sum_m, load->load[pos]);
+ +                            load->sum_m = std::max(load->sum_m, load->load[pos]);
                           }
                           else
                           {
                               load->sum_m += load->load[pos];
                           }
                           pos++;
- -                        load->cvol_min = min(load->cvol_min, load->load[pos]);
+ +                        load->cvol_min = std::min(load->cvol_min, load->load[pos]);
                           pos++;
                           if (d < dd->ndim-1)
                           {
@@@ -5357,9 -5366,9 +5365,9 @@@
                       }
                       if (bSepPME)
                       {
- -                        load->mdf = max(load->mdf, load->load[pos]);
+ +                        load->mdf = std::max(load->mdf, load->load[pos]);
                           pos++;
- -                        load->pme = max(load->pme, load->load[pos]);
+ +                        load->pme = std::max(load->pme, load->load[pos]);
                           pos++;
                       }
                   }
@@@ -5753,6 -5762,7 +5761,6 @@@ static void make_load_communicators(gmx
   
   void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
   {
- -    gmx_bool                bZYX;
       int                     d, dim, i, j, m;
       ivec                    tmp, s;
       int                     nzone, nzonep;
@@@ -5911,16 -5921,17 +5919,16 @@@
   static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
   {
       gmx_domdec_t      *dd;
+ +    dd   = cr->dd;
+ +
+ +#ifdef GMX_MPI
       gmx_domdec_comm_t *comm;
- -    int                i, rank, *buf;
+ +    int                rank, *buf;
       ivec               periods;
- -#ifdef GMX_MPI
       MPI_Comm           comm_cart;
- -#endif
   
- -    dd   = cr->dd;
       comm = dd->comm;
   
- -#ifdef GMX_MPI
       if (comm->bCartesianPP)
       {
           /* Set up cartesian communication for the particle-particle part */
@@@ -5930,7 -5941,7 +5938,7 @@@
                       dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
           }
   
- -        for (i = 0; i < DIM; i++)
+ +        for (int i = 0; i < DIM; i++)
           {
               periods[i] = TRUE;
           }
@@@ -5994,7 -6005,7 +6002,7 @@@
           /* Determine the master coordinates and rank.
            * The DD master should be the same node as the master of this sim.
            */
- -        for (i = 0; i < dd->nnodes; i++)
+ +        for (int i = 0; i < dd->nnodes; i++)
           {
               if (comm->ddindex2simnodeid[i] == 0)
               {
@@@ -6032,27 -6043,30 +6040,27 @@@
       }
   }
   
- -static void receive_ddindex2simnodeid(t_commrec *cr)
+ +static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
   {
+ +#ifdef GMX_MPI
       gmx_domdec_t      *dd;
- -
       gmx_domdec_comm_t *comm;
- -    int               *buf;
   
       dd   = cr->dd;
       comm = dd->comm;
   
- -#ifdef GMX_MPI
       if (!comm->bCartesianPP_PME && comm->bCartesianPP)
       {
+ +        int *buf;
           snew(comm->ddindex2simnodeid, dd->nnodes);
           snew(buf, dd->nnodes);
           if (cr->duty & DUTY_PP)
           {
               buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
           }
- -#ifdef GMX_MPI
           /* Communicate the ddindex to simulation nodeid index */
           MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                         cr->mpi_comm_mysim);
- -#endif
           sfree(buf);
       }
   #endif
@@@ -6094,8 -6108,9 +6102,8 @@@ static void split_communicator(FILE *fp
   {
       gmx_domdec_t      *dd;
       gmx_domdec_comm_t *comm;
- -    int                i, rank;
+ +    int                i;
       gmx_bool           bDiv[DIM];
- -    ivec               periods;
   #ifdef GMX_MPI
       MPI_Comm           comm_cart;
   #endif
@@@ -6143,9 -6158,6 +6151,9 @@@
   #ifdef GMX_MPI
       if (comm->bCartesianPP_PME)
       {
+ +        int  rank;
+ +        ivec periods;
+ +
           if (fplog)
           {
               fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
@@@ -6157,6 -6169,7 +6165,6 @@@
           }
           MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                           &comm_cart);
- -
           MPI_Comm_rank(comm_cart, &rank);
           if (MASTERNODE(cr) && rank != 0)
           {
@@@ -6340,7 -6353,7 +6348,7 @@@ static real *get_slb_frac(FILE *fplog, 
           for (i = 0; i < nc; i++)
           {
               dbl = 0;
- -            sscanf(size_string, "%lf%n", &dbl, &n);
+ +            sscanf(size_string, "%20lf%n", &dbl, &n);
               if (dbl == 0)
               {
                   gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
@@@ -6403,7 -6416,7 +6411,7 @@@ static int dd_getenv(FILE *fplog, cons
       val = getenv(env_var);
       if (val)
       {
- -        if (sscanf(val, "%d", &nst) <= 0)
+ +        if (sscanf(val, "%20d", &nst) <= 0)
           {
               nst = 1;
           }
@@@ -6464,7 -6477,7 +6472,7 @@@ static real average_cellsize_min(gmx_do
       {
           d = dd->dim[di];
           /* Check using the initial average cell size */
- -        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
+ +        r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
       }
   
       return r;
@@@ -6474,6 -6487,7 +6482,6 @@@ static int check_dlb_support(FILE *fplo
                                const char *dlb_opt, gmx_bool bRecordLoad,
                                unsigned long Flags, t_inputrec *ir)
   {
- -    gmx_domdec_t *dd;
       int           eDLB = -1;
       char          buf[STRLEN];
   
@@@ -6615,10 -6629,10 +6623,10 @@@ gmx_domdec_t *init_domain_decomposition
       gmx_domdec_t      *dd;
       gmx_domdec_comm_t *comm;
       int                recload;
- -    int                d, i, j;
       real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
       gmx_bool           bC;
       char               buf[STRLEN];
+ +    const real         tenPercentMargin = 1.1;
   
       if (fplog)
       {
@@@ -6673,7 -6687,8 +6681,8 @@@
       /* Initialize to GPU share count to 0, might change later */
       comm->nrank_gpu_shared = 0;
   
-     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+     comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+     comm->bDLB_locked = FALSE;
   
       comm->bDynLoadBal = (comm->eDLB == edlbYES);
       if (fplog)
@@@ -6739,6 -6754,13 +6748,13 @@@
       comm->cellsize_limit = 0;
       comm->bBondComm      = FALSE;
   
- -    comm->cellsize_limit = max(comm->cellsize_limit,
- -                               ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
+     /* Atoms should be able to move by up to half the list buffer size (if > 0)
+      * within nstlist steps. Since boundaries are allowed to displace by half
+      * a cell size, DD cells should be at least the size of the list buffer.
+      */
++    comm->cellsize_limit = std::max(comm->cellsize_limit,
++                                    ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
+ 
       if (comm->bInterCGBondeds)
       {
           if (comm_distance_min > 0)
@@@ -6750,7 -6772,7 +6766,7 @@@
               }
               else
               {
- -                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
+ +                comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
               }
               r_bonded_limit = comm->cutoff_mbody;
           }
@@@ -6776,16 -6798,16 +6792,16 @@@
                */
               if (Flags & MD_DDBONDCOMM)
               {
- -                if (max(r_2b, r_mb) > comm->cutoff)
+ +                if (std::max(r_2b, r_mb) > comm->cutoff)
                   {
- -                    r_bonded        = max(r_2b, r_mb);
- -                    r_bonded_limit  = 1.1*r_bonded;
+ +                    r_bonded        = std::max(r_2b, r_mb);
+ +                    r_bonded_limit  = tenPercentMargin*r_bonded;
                       comm->bBondComm = TRUE;
                   }
                   else
                   {
                       r_bonded       = r_mb;
- -                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
+ +                    r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
                   }
                   /* We determine cutoff_mbody later */
               }
@@@ -6794,12 -6816,12 +6810,12 @@@
                   /* No special bonded communication,
                    * simply increase the DD cut-off.
                    */
- -                r_bonded_limit     = 1.1*max(r_2b, r_mb);
+ +                r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
                   comm->cutoff_mbody = r_bonded_limit;
- -                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
+ +                comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
               }
           }
- -        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
+ +        comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
           if (fplog)
           {
               fprintf(fplog,
@@@ -6833,7 -6855,7 +6849,7 @@@
                   "User supplied maximum distance required for P-LINCS: %.3f nm\n",
                   rconstr);
       }
- -    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
+ +    comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
   
       comm->cgs_gl = gmx_mtop_global_cgs(mtop);
   
@@@ -6989,12 -7011,12 +7005,12 @@@
               if (comm->eDLB != edlbNO)
               {
                   /* Check if this does not limit the scaling */
- -                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
+ +                comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
               }
               if (!comm->bBondComm)
               {
                   /* Without bBondComm do not go beyond the n.b. cut-off */
- -                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
+ +                comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
                   if (comm->cellsize_limit >= comm->cutoff)
                   {
                       /* We don't loose a lot of efficieny
@@@ -7006,7 -7028,7 +7022,7 @@@
                   }
               }
               /* Check if we did not end up below our original limit */
- -            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
+ +            comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
   
               if (comm->cutoff_mbody > comm->cellsize_limit)
               {
@@@ -7069,7 -7091,7 +7085,7 @@@ static void turn_on_dlb(FILE *fplog, t_
       cellsize_min = comm->cellsize_min[dd->dim[0]];
       for (d = 1; d < dd->ndim; d++)
       {
- -        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
+ +        cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
       }
   
       if (cellsize_min < comm->cellsize_limit*1.05)
@@@ -7134,6 -7156,8 +7150,6 @@@ void dd_init_bondeds(FILE *fplog
                        t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
   {
       gmx_domdec_comm_t *comm;
- -    gmx_bool           bBondComm;
- -    int                d;
   
       dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
   
@@@ -7244,7 -7268,7 +7260,7 @@@ static void print_dd_settings(FILE *fpl
               limit = dd->comm->cellsize_min[XX];
               for (d = 1; d < DIM; d++)
               {
- -                limit = min(limit, dd->comm->cellsize_min[d]);
+ +                limit = std::min(limit, dd->comm->cellsize_min[d]);
               }
           }
   
@@@ -7252,10 -7276,10 +7268,10 @@@
           {
               fprintf(fplog, "%40s  %-7s %6.3f nm\n",
                       "two-body bonded interactions", "(-rdd)",
- -                    max(comm->cutoff, comm->cutoff_mbody));
+ +                    std::max(comm->cutoff, comm->cutoff_mbody));
               fprintf(fplog, "%40s  %-7s %6.3f nm\n",
                       "multi-body bonded interactions", "(-rdd)",
- -                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
+ +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
           }
           if (dd->vsite_comm)
           {
@@@ -7290,7 -7314,7 +7306,7 @@@ static void set_cell_limits_dlb(gmx_dom
   
       /* Determine the maximum number of comm. pulses in one dimension */
   
- -    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
+ +    comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
   
       /* Determine the maximum required number of grid pulses */
       if (comm->cellsize_limit >= comm->cutoff)
@@@ -7311,7 -7335,7 +7327,7 @@@
       else
       {
           /* There is no cell size limit */
- -        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
+ +        npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
       }
   
       if (!bNoCutOff && npulse > 1)
@@@ -7323,9 -7347,9 +7339,9 @@@
               dim      = dd->dim[d];
               npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
                                /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
- -            npulse_d_max = max(npulse_d_max, npulse_d);
+ +            npulse_d_max = std::max(npulse_d_max, npulse_d);
           }
- -        npulse = min(npulse, npulse_d_max);
+ +        npulse = std::min(npulse, npulse_d_max);
       }
   
       /* This env var can override npulse */
@@@ -7339,10 -7363,10 +7355,10 @@@
       comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
       for (d = 0; d < dd->ndim; d++)
       {
- -        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
+ +        comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
           comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
           snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
- -        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
+ +        comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
           if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
           {
               comm->bVacDLBNoLimit = FALSE;
@@@ -7352,10 -7376,10 +7368,10 @@@
       /* cellsize_limit is set for LINCS in init_domain_decomposition */
       if (!comm->bVacDLBNoLimit)
       {
- -        comm->cellsize_limit = max(comm->cellsize_limit,
- -                                   comm->cutoff/comm->maxpulse);
+ +        comm->cellsize_limit = std::max(comm->cellsize_limit,
+ +                                        comm->cutoff/comm->maxpulse);
       }
- -    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
+ +    comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
       /* Set the minimum cell size for each DD dimension */
       for (d = 0; d < dd->ndim; d++)
       {
@@@ -7372,7 -7396,7 +7388,7 @@@
       }
       if (comm->cutoff_mbody <= 0)
       {
- -        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
+ +        comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
       }
       if (comm->bDynLoadBal)
       {
@@@ -7463,7 -7487,7 +7479,7 @@@ void set_dd_parameters(FILE *fplog, gmx
       }
       natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
   
- -    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
+ +    dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
   }
   
   static gmx_bool test_dd_cutoff(t_commrec *cr,
@@@ -7564,6 -7588,20 +7580,20 @@@ void change_dd_dlb_cutoff_limit(t_commr
       comm->PMELoadBal_max_cutoff = comm->cutoff;
   }
   
+ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+ {
+     return dd->comm->bDLB_locked;
+ }
+ 
+ void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+ {
+     /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+     if (dd->comm->eDLB == edlbAUTO)
+     {
+         dd->comm->bDLB_locked = bValue;
+     }
+ }
+ 
   static void merge_cg_buffers(int ncell,
                                gmx_domdec_comm_dim_t *cd, int pulse,
                                int  *ncg_cell,
@@@ -7727,11 -7765,11 +7757,11 @@@ set_dd_corners(const gmx_domdec_t *dd
           c->c[1][1] = comm->cell_x0[dim1];
           if (dd->bGridJump)
           {
- -            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
+ +            c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
               if (bDistMB)
               {
                   /* For the multi-body distance we need the maximum */
- -                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
+ +                c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
               }
           }
           /* Set the upper-right corner for rounding */
@@@ -7754,8 -7792,8 +7784,8 @@@
                           if (j >= 4)
                           {
                               c->c[2][j-4] =
- -                                max(c->c[2][j-4],
- -                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
+ +                                std::max(c->c[2][j-4],
+ +                                         comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                           }
                       }
                   }
@@@ -7767,7 -7805,7 +7797,7 @@@
                       {
                           for (j = 0; j < 2; j++)
                           {
- -                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
+ +                            c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
                           }
                       }
                   }
@@@ -7781,11 -7819,11 +7811,11 @@@
               c->cr1[3] = comm->cell_x1[dim1];
               if (dd->bGridJump)
               {
- -                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
+ +                c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                   if (bDistMB)
                   {
                       /* For the multi-body distance we need the maximum */
- -                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
+ +                    c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                   }
               }
           }
@@@ -8065,7 -8103,7 +8095,7 @@@ static void setup_dd_communication(gmx_
   {
       int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
       int                    nzone, nzone_send, zone, zonei, cg0, cg1;
- -    int                    c, i, j, cg, cg_gl, nrcg;
+ +    int                    c, i, cg, cg_gl, nrcg;
       int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
       gmx_domdec_comm_t     *comm;
       gmx_domdec_zones_t    *zones;
@@@ -8073,7 -8111,7 +8103,7 @@@
       gmx_domdec_ind_t      *ind;
       cginfo_mb_t           *cginfo_mb;
       gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
- -    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
+ +    real                   r_comm2, r_bcomm2;
       dd_corners_t           corners;
       ivec                   tric_dist;
       rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
@@@ -8104,6 -8142,8 +8134,6 @@@
   
       for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
       {
- -        dim = dd->dim[dim_ind];
- -
           /* Check if we need to use triclinic distances */
           tric_dist[dim_ind] = 0;
           for (i = 0; i <= dim_ind; i++)
@@@ -8395,7 -8435,7 +8425,7 @@@
                       /* The rvec buffer is also required for atom buffers
                        * of size nrecv in dd_move_x and dd_move_f.
                        */
- -                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
+ +                    i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
                       vec_rvec_check_alloc(&comm->vbuf2, i);
                   }
               }
@@@ -8540,9 -8580,10 +8570,9 @@@ static void set_zones_size(gmx_domdec_
       gmx_domdec_comm_t  *comm;
       gmx_domdec_zones_t *zones;
       gmx_bool            bDistMB;
- -    int                 z, zi, zj0, zj1, d, dim;
+ +    int                 z, zi, d, dim;
       real                rcs, rcmbs;
       int                 i, j;
- -    real                size_j, add_tric;
       real                vol;
   
       comm = dd->comm;
@@@ -8640,8 -8681,8 +8670,8 @@@
                                    * With multiple pulses this will lead
                                    * to a larger zone then strictly necessary.
                                    */
- -                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
- -                                                             zones->size[zi].x1[dim]+rcmbs);
+ +                                zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
+ +                                                                  zones->size[zi].x1[dim]+rcmbs);
                               }
                           }
                       }
@@@ -8660,8 -8701,8 +8690,8 @@@
                   {
                       if (zones->shift[z][dim] > 0)
                       {
- -                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
- -                                                     zones->size[zi].x1[dim]+rcs);
+ +                        zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
+ +                                                          zones->size[zi].x1[dim]+rcs);
                       }
                   }
               }
@@@ -8725,8 -8766,8 +8755,8 @@@
               {
                   for (i = 0; i < DIM; i++)
                   {
- -                    corner_min[i] = min(corner_min[i], corner[i]);
- -                    corner_max[i] = max(corner_max[i], corner[i]);
+ +                    corner_min[i] = std::min(corner_min[i], corner[i]);
+ +                    corner_max[i] = std::max(corner_max[i], corner[i]);
                   }
               }
           }
@@@ -8896,7 -8937,8 +8926,7 @@@ static int dd_sort_order(gmx_domdec_t *
   {
       gmx_domdec_sort_t *sort;
       gmx_cgsort_t      *cgsort, *sort_i;
- -    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
- -    int                sort_last, sort_skip;
+ +    int                ncg_new, nsort2, nsort_new, i, *a, moved;
   
       sort = dd->comm->sort;
   
@@@ -9007,7 -9049,7 +9037,7 @@@ static void dd_sort_state(gmx_domdec_t 
                             int ncg_home_old)
   {
       gmx_domdec_sort_t *sort;
- -    gmx_cgsort_t      *cgsort, *sort_i;
+ +    gmx_cgsort_t      *cgsort;
       int               *cgindex;
       int                ncg_new, i, *ibuf, cgsize;
       rvec              *vbuf;
@@@ -9264,7 -9306,7 +9294,7 @@@ void dd_partition_system(FIL
       t_block           *cgs_gl;
       gmx_int64_t        step_pcoupl;
       rvec               cell_ns_x0, cell_ns_x1;
- -    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
+ +    int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
       gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
       gmx_bool           bRedist, bSortCG, bResortAll;
       ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
@@@ -9324,17 -9366,18 +9354,17 @@@
       }
   
       /* Check if we have recorded loads on the nodes */
-     if (comm->bRecordLoad && dd_load_count(comm))
+     if (comm->bRecordLoad && dd_load_count(comm) > 0)
       {
-         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
           {
               /* Check if we should use DLB at the second partitioning
                * and every 100 partitionings,
                * so the extra communication cost is negligible.
                */
-             n         = std::max(100, nstglobalcomm);
+             const int nddp_chk_dlb = 100;
- -
               bCheckDLB = (comm->n_load_collect == 0 ||
-                          comm->n_load_have % n == n-1);
+                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
           }
           else
           {
@@@ -9372,8 -9415,26 +9402,26 @@@
                   /* Since the timings are node dependent, the master decides */
                   if (DDMASTER(dd))
                   {
-                     bTurnOnDLB =
-                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                     /* Here we check if the max PME rank load is more than 0.98
+                      * the max PP force load. If so, PP DLB will not help,
+                      * since we are (almost) limited by PME. Furthermore,
+                      * DLB will cause a significant extra x/f redistribution
+                      * cost on the PME ranks, which will then surely result
+                      * in lower total performance.
+                      * This check might be fragile, since one measurement
+                      * below 0.98 (although only done once every 100 DD part.)
+                      * could turn on DLB for the rest of the run.
+                      */
+                     if (cr->npmenodes > 0 &&
+                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                     {
+                         bTurnOnDLB = FALSE;
+                     }
+                     else
+                     {
+                         bTurnOnDLB =
+                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                     }
                       if (debug)
                       {
                           fprintf(debug, "step %s, imb loss %f\n",
diff --combined src/gromacs/mdlib/force.c

index f98b073fee59645acfddc2992220bea2377dd2a3,d4e6445491e36fe2f10bff1cf360d7b6d8ae8050..4313d3e75ef729035af5e0600472183e92fadeb8
--- 1/src/gromacs/mdlib/force.c
--- 2/src/gromacs/mdlib/force.c
+++ b/src/gromacs/mdlib/force.c
@@@ -34,39 -34,38 +34,39 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "gromacs/legacyheaders/force.h"
+ +
+ +#include "config.h"
   
+ +#include <assert.h>
   #include <math.h>
   #include <string.h>
- -#include <assert.h>
- -#include "sysstuff.h"
- -#include "typedefs.h"
- -#include "macros.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "macros.h"
- -#include "physics.h"
- -#include "force.h"
- -#include "nonbonded.h"
- -#include "names.h"
- -#include "network.h"
- -#include "pbc.h"
- -#include "ns.h"
- -#include "nrnb.h"
- -#include "bondf.h"
- -#include "mshift.h"
- -#include "txtdump.h"
- -#include "coulomb.h"
- -#include "pme.h"
- -#include "mdrun.h"
- -#include "domdec.h"
- -#include "qmmm.h"
- -#include "gmx_omp_nthreads.h"
   
+ +#include "gromacs/legacyheaders/coulomb.h"
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/listed-forces/bonded.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/timing/wallcycle.h"
- -#include "gmx_fatal.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
   void ns(FILE              *fp,
           t_forcerec        *fr,
@@@ -118,9 -117,11 +118,11 @@@ static void reduce_thread_forces(int n
                                    int nthreads, f_thread_t *f_t)
   {
       int t, i;
+     int nthreads_loop gmx_unused;
   
       /* This reduction can run over any number of threads */
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+     nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+ #pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
       for (i = 0; i < n; i++)
       {
           for (t = 1; t < nthreads; t++)
@@@ -139,7 -140,13 +141,7 @@@
       }
   }
   
- -void gmx_print_sepdvdl(FILE *fplog, const char *s, real v, real dvdlambda)
- -{
- -    fprintf(fplog, "  %-30s V %12.5e  dVdl %12.5e\n", s, v, dvdlambda);
- -}
- -
- -void do_force_lowlevel(FILE       *fplog,   gmx_int64_t step,
- -                       t_forcerec *fr,      t_inputrec *ir,
+ +void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                          t_idef     *idef,    t_commrec  *cr,
                          t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
                          t_mdatoms  *md,
@@@ -150,6 -157,7 +152,6 @@@
                          t_fcdata   *fcd,
                          gmx_localtop_t *top,
                          gmx_genborn_t *born,
- -                       t_atomtypes *atype,
                          gmx_bool       bBornRadii,
                          matrix     box,
                          t_lambda   *fepvals,
@@@ -162,7 -170,7 +164,7 @@@
   {
       int         i, j;
       int         donb_flags;
- -    gmx_bool    bDoEpot, bSepDVDL, bSB;
+ +    gmx_bool    bDoEpot, bSB;
       int         pme_flags;
       matrix      boxs;
       rvec        box_size;
@@@ -176,6 -184,8 +178,6 @@@
       double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
   #endif
   
- -#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) { gmx_print_sepdvdl(fplog, s, v, dvdlambda); }
- -
       set_pbc(&pbc, fr->ePBC, box);
   
       /* reset free energy components */
@@@ -191,6 -201,7 +193,6 @@@
           box_size[i] = box[i][i];
       }
   
- -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog));
       debug_gmx();
   
       /* do QMMM first if requested */
@@@ -199,6 -210,12 +201,6 @@@
           enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr);
       }
   
- -    if (bSepDVDL)
- -    {
- -        fprintf(fplog, "Step %s: non-bonded V and dVdl for rank %d:\n",
- -                gmx_step_str(step, buf), cr->nodeid);
- -    }
- -
       /* Call the short range functions all in one go. */
   
   #ifdef GMX_MPI
@@@ -216,6 -233,7 +218,6 @@@
           /* foreign lambda component for walls */
           real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
                                      enerd->grpp.ener[egLJSR], nrnb);
- -        PRINT_SEPDVDL("Walls", 0.0, dvdl_walls);
           enerd->dvdl_lin[efptVDW] += dvdl_walls;
       }
   
@@@ -298,10 -316,10 +300,10 @@@
       /* MRS: Eventually, many need to include free energy contribution here! */
       if (ir->implicit_solvent)
       {
- -        wallcycle_sub_start(wcycle, ewcsBONDED);
+ +        wallcycle_sub_start(wcycle, ewcsLISTED);
           calc_gb_forces(cr, md, born, top, x, f, fr, idef,
                          ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd);
- -        wallcycle_sub_stop(wcycle, ewcsBONDED);
+ +        wallcycle_sub_stop(wcycle, ewcsLISTED);
       }
   
   #ifdef GMX_MPI
@@@ -333,6 -351,24 +335,6 @@@
           enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
       }
   
- -    if (bSepDVDL)
- -    {
- -        real V_short_range    = 0;
- -        real dvdl_short_range = 0;
- -
- -        for (i = 0; i < enerd->grpp.nener; i++)
- -        {
- -            V_short_range +=
- -                (fr->bBHAM ?
- -                 enerd->grpp.ener[egBHAMSR][i] :
- -                 enerd->grpp.ener[egLJSR][i])
- -                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
- -        }
- -        dvdl_short_range = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
- -        PRINT_SEPDVDL("VdW and Coulomb SR particle-p.",
- -                      V_short_range,
- -                      dvdl_short_range);
- -    }
       debug_gmx();
   
   
@@@ -341,9 -377,9 +343,9 @@@
           pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
       }
   
- -    /* Shift the coordinates. Must be done before bonded forces and PPPM,
+ +    /* Shift the coordinates. Must be done before listed forces and PPPM,
        * but is also necessary for SHAKE and update, therefore it can NOT
- -     * go when no bonded forces have to be evaluated.
+ +     * go when no listed forces have to be evaluated.
        */
   
       /* Here sometimes we would not need to shift with NBFonly,
@@@ -361,9 -397,9 +363,9 @@@
               inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
           }
       }
- -    /* Check whether we need to do bondeds or correct for exclusions */
+ +    /* Check whether we need to do listed interactions or correct for exclusions */
       if (fr->bMolPBC &&
- -        ((flags & GMX_FORCE_BONDED)
+ +        ((flags & GMX_FORCE_LISTED)
            || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)))
       {
           /* Since all atoms are in the rectangular or triclinic unit-cell,
@@@ -373,13 -409,14 +375,13 @@@
       }
       debug_gmx();
   
- -    if (flags & GMX_FORCE_BONDED)
+ +    if (flags & GMX_FORCE_LISTED)
       {
- -        wallcycle_sub_start(wcycle, ewcsBONDED);
- -        calc_bonds(fplog, cr->ms,
- -                   idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
- -                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
- -                   flags,
- -                   fr->bSepDVDL && do_per_step(step, ir->nstlog), step);
+ +        wallcycle_sub_start(wcycle, ewcsLISTED);
+ +        calc_bonds(cr->ms,
+ +                   idef, (const rvec *) x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
+ +                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
+ +                   flags);
   
           /* Check if we have to determine energy differences
            * at foreign lambda's.
@@@ -398,7 -435,7 +400,7 @@@
                   {
                       lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
                   }
- -                calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
+ +                calc_bonds_lambda(idef, (const rvec *) x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
                                     fcd, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                   sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
                   enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
@@@ -406,25 -443,17 +408,25 @@@
           }
           debug_gmx();
   
- -        wallcycle_sub_stop(wcycle, ewcsBONDED);
+ +        wallcycle_sub_stop(wcycle, ewcsLISTED);
       }
   
       where();
   
       *cycles_pme = 0;
+ +    clear_mat(fr->vir_el_recip);
+ +    clear_mat(fr->vir_lj_recip);
+ +
+ +    /* Do long-range electrostatics and/or LJ-PME, including related short-range
+ +     * corrections.
+ +     */
       if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
       {
- -        real Vlr             = 0, Vcorr = 0;
- -        real dvdl_long_range = 0;
- -        int  status          = 0;
+ +        real Vlr               = 0, Vcorr = 0;
+ +        real dvdl_long_range   = 0;
+ +        int  status            = 0;
+ +        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
+ +        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
   
           bSB = (ir->nwall == 2);
           if (bSB)
@@@ -433,6 -462,20 +435,6 @@@
               svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
               box_size[ZZ] *= ir->wall_ewald_zfac;
           }
- -    }
- -
- -    /* Do long-range electrostatics and/or LJ-PME, including related short-range
- -     * corrections.
- -     */
- -
- -    clear_mat(fr->vir_el_recip);
- -    clear_mat(fr->vir_lj_recip);
- -
- -    if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
- -    {
- -        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
- -        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;
- -        int  status            = 0;
   
           if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype))
           {
@@@ -534,10 -577,15 +536,10 @@@
                                                      fr->vir_el_recip);
               }
   
- -            PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr_q, dvdl_long_range_correction_q);
- -            PRINT_SEPDVDL("Ewald excl. corr. LJ", Vcorr_lj, dvdl_long_range_correction_lj);
               enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q;
               enerd->dvdl_lin[efptVDW]  += dvdl_long_range_correction_lj;
- -        }
   
- -        if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)))
- -        {
- -            if (cr->duty & DUTY_PME)
+ +            if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)) && (cr->duty & DUTY_PME))
               {
                   /* Do reciprocal PME for Coulomb and/or LJ. */
                   assert(fr->n_tpi >= 0);
@@@ -609,6 -657,7 +611,6 @@@
                                           md->chargeA + md->homenr - fr->n_tpi,
                                           &Vlr_q);
                   }
- -                PRINT_SEPDVDL("PME mesh", Vlr_q + Vlr_lj, dvdl_long_range_q+dvdl_long_range_lj);
               }
           }
   
@@@ -619,6 -668,7 +621,6 @@@
                                box_size, cr, md->homenr,
                                fr->vir_el_recip, fr->ewaldcoeff_q,
                                lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table);
- -            PRINT_SEPDVDL("Ewald long-range", Vlr_q, dvdl_long_range_q);
           }
   
           /* Note that with separate PME nodes we get the real energies later */
@@@ -653,6 -703,8 +655,6 @@@
                                          fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);
   
                   enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
- -                PRINT_SEPDVDL("RF exclusion correction",
- -                              enerd->term[F_RF_EXCL], dvdl_rf_excl);
               }
           }
       }
diff --combined src/gromacs/mdlib/mdatom.c

index 01c5c55bd6140a5453f8ee24eac3a13f80e0e70a,b8d51a76da80566855c2088a159bc843d3a78e4e..d1722dedf172989c6616e38b7ff59f2d8746c115
--- 1/src/gromacs/mdlib/mdatom.c
--- 2/src/gromacs/mdlib/mdatom.c
+++ b/src/gromacs/mdlib/mdatom.c
@@@ -34,16 -34,19 +34,16 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
   #include <math.h>
   
- -#include "typedefs.h"
- -#include "mdatoms.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/topology/mtop_util.h"
   #include "gromacs/utility/smalloc.h"
- -#include "main.h"
- -#include "qmmm.h"
- -#include "mtop_util.h"
- -#include "gmx_omp_nthreads.h"
   
   #define ALMOST_ZERO 1e-30
   
@@@ -118,6 -121,7 +118,7 @@@ void atoms2md(gmx_mtop_t *mtop, t_input
       t_grpopts            *opts;
       gmx_groups_t         *groups;
       gmx_molblock_t       *molblock;
+     int                   nthreads gmx_unused;
   
       bLJPME = EVDW_PME(ir->vdwtype);
   
@@@ -230,7 -234,8 +231,8 @@@
   
       alook = gmx_mtop_atomlookup_init(mtop);
   
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntDefault);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
       for (i = 0; i < md->nr; i++)
       {
           int      g, ag, molb;
diff --combined src/gromacs/mdlib/minimize.c

index 234bcbdd8fc68df3abae1445a5c266ba73d8a294,69008f53fabcb3388951c01d7554d479c67ccdca..5255d544d146f0fa3cea26efe922ef5f8665a302
--- 1/src/gromacs/mdlib/minimize.c
--- 2/src/gromacs/mdlib/minimize.c
+++ b/src/gromacs/mdlib/minimize.c
@@@ -34,51 -34,51 +34,51 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "config.h"
   
+ +#include <math.h>
   #include <string.h>
   #include <time.h>
- -#include <math.h>
- -#include "sysstuff.h"
- -#include "gromacs/utility/cstringutil.h"
- -#include "network.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "nrnb.h"
- -#include "main.h"
- -#include "force.h"
- -#include "macros.h"
- -#include "names.h"
- -#include "gmx_fatal.h"
- -#include "txtdump.h"
- -#include "typedefs.h"
- -#include "update.h"
- -#include "constr.h"
- -#include "vec.h"
- -#include "tgroup.h"
- -#include "mdebin.h"
- -#include "vsite.h"
- -#include "force.h"
- -#include "mdrun.h"
- -#include "md_support.h"
- -#include "sim_util.h"
- -#include "domdec.h"
- -#include "mdatoms.h"
- -#include "ns.h"
- -#include "mtop_util.h"
- -#include "pme.h"
- -#include "bondf.h"
- -#include "gmx_omp_nthreads.h"
- -#include "md_logging.h"
   
   #include "gromacs/fileio/confio.h"
+ +#include "gromacs/fileio/mtxio.h"
   #include "gromacs/fileio/trajectory_writing.h"
- -#include "gromacs/linearalgebra/mtxio.h"
+ +#include "gromacs/imd/imd.h"
+ +#include "gromacs/legacyheaders/bonded-threading.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/md_logging.h"
+ +#include "gromacs/legacyheaders/md_support.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/mdebin.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/sim_util.h"
+ +#include "gromacs/legacyheaders/tgroup.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/update.h"
+ +#include "gromacs/legacyheaders/vsite.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
   #include "gromacs/linearalgebra/sparsematrix.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/timing/wallcycle.h"
   #include "gromacs/timing/walltime_accounting.h"
- -#include "gromacs/imd/imd.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
   typedef struct {
       t_state  s;
@@@ -550,6 -550,7 +550,7 @@@ static void do_em_step(t_commrec *cr, t
       int      start, end;
       rvec    *x1, *x2;
       real     dvdl_constr;
+     int      nthreads gmx_unused;
   
       s1 = &ems1->s;
       s2 = &ems2->s;
@@@ -587,7 -588,8 +588,8 @@@
       x1 = s1->x;
       x2 = s2->x;
   
- #pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+     nthreads = gmx_omp_nthreads_get(emntUpdate);
+ #pragma omp parallel num_threads(nthreads)
       {
           int gf, i, m;
   
@@@ -771,7 -773,7 +773,7 @@@ static void evaluate_energy(FILE *fplog
       }
   
       /* Calculate long range corrections to pressure and energy */
- -    calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
+ +    calc_dispcorr(inputrec, fr, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
                     pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
       enerd->term[F_DISPCORR] = enercorr;
       enerd->term[F_EPOT]    += enercorr;
@@@ -790,6 -792,10 +792,6 @@@
                     ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
                     ems->s.lambda[efptBONDED], &dvdl_constr,
                     NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0);
- -        if (fr->bSepDVDL && fplog)
- -        {
- -            gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr);
- -        }
           enerd->term[F_DVDL_CONSTR] += dvdl_constr;
           m_add(force_vir, shake_vir, vir);
           wallcycle_stop(wcycle, ewcCONSTR);
@@@ -1786,6 -1792,7 +1788,6 @@@ double do_lbfgs(FILE *fplog, t_commrec 
       }
   
       stepsize  = 1.0/fnorm;
- -    converged = FALSE;
   
       /* Start the loop over BFGS steps.
        * Each successful step is counted, and we continue until
diff --combined src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 799425edab3121ddcd2a131f0a92621ee6482bfc,d02e9034104480e733b0d4bdadd3827ba00eccb2..c024f7b02c628ea814e30ed38c23212fbde6aba4
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
--- 2/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@@ -32,14 -32,12 +32,14 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "nbnxn_cuda.h"
+ +
+ +#include "config.h"
   
- -#include <stdlib.h>
   #include <assert.h>
+ +#include <stdlib.h>
   
   #if defined(_MSVC)
   #include <limits>
@@@ -47,21 -45,21 +47,21 @@@
   
   #include <cuda.h>
   
- -#include "types/simple.h"
- -#include "types/nbnxn_pairlist.h"
- -#include "types/nb_verlet.h"
- -#include "types/ishift.h"
- -#include "types/force_flags.h"
- -#include "../nbnxn_consts.h"
- -
   #ifdef TMPI_ATOMICS
   #include "thread_mpi/atomic.h"
   #endif
   
+ +#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+ +#include "gromacs/legacyheaders/types/force_flags.h"
+ +#include "gromacs/legacyheaders/types/simple.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_consts.h"
+ +#include "gromacs/mdlib/nbnxn_pairlist.h"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +
   #include "nbnxn_cuda_types.h"
- -#include "../../gmxlib/cuda_tools/cudautils.cuh"
- -#include "nbnxn_cuda.h"
- -#include "nbnxn_cuda_data_mgmt.h"
   
   #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
   #define USE_TEXOBJ
@@@ -81,7 -79,7 +81,7 @@@ texture<float, 1, cudaReadModeElementTy
   #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
   
   /***** The kernels come here *****/
- -#include "nbnxn_cuda_kernel_utils.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
   
   /* Top-level kernel generation: will generate through multiple inclusion the
    * following flavors for all kernels:
@@@ -91,19 -89,19 +91,19 @@@
    * - force and energy output with pair list pruning.
    */
   /** Force only **/
- -#include "nbnxn_cuda_kernels.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
   /** Force & energy **/
   #define CALC_ENERGIES
- -#include "nbnxn_cuda_kernels.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
   #undef CALC_ENERGIES
   
   /*** Pair-list pruning kernels ***/
   /** Force only **/
   #define PRUNE_NBL
- -#include "nbnxn_cuda_kernels.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
   /** Force & energy **/
   #define CALC_ENERGIES
- -#include "nbnxn_cuda_kernels.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
   #undef CALC_ENERGIES
   #undef PRUNE_NBL
   
@@@ -212,7 -210,7 +212,7 @@@ static inline nbnxn_cu_kfunc_ptr_t sele
       nbnxn_cu_kfunc_ptr_t res;
   
       assert(eeltype < eelCuNR);
-     assert(evdwtype < eelCuNR);
+     assert(evdwtype < evdwCuNR);
   
       if (bDoEne)
       {
@@@ -716,18 -714,18 +716,18 @@@ void nbnxn_cuda_set_cacheconfig(cuda_de
               if (devinfo->prop.major >= 3)
               {
                   /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
                   stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
               }
               else
               {
                   /* On Fermi prefer L1 gives 2% higher performance */
                   /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
                   stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
               }
               CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
diff --combined src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh

index c468306899094a050f87816c90cb0fe4087de5a0,3bdb17f8feaf51b33c790c09f252a5243a8d0578..3b37869e6dda00b17ae3f2fa4bced6347a271587
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
--- 2/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@@ -52,30 -52,30 +52,30 @@@
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
@@@ -88,30 -88,30 +88,30 @@@
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
@@@ -124,30 -124,30 +124,30 @@@
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
@@@ -157,39 -157,39 +157,39 @@@
   /* Analytical Ewald interaction kernels with twin-range cut-off
    */
   #define EL_EWALD_ANA
- #define LJ_CUTOFF_CHECK
+ #define VDW_CUTOFF_CHECK
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
   #undef EL_EWALD_ANA
- #undef LJ_CUTOFF_CHECK
+ #undef VDW_CUTOFF_CHECK
   
   
   /* Tabulated Ewald interaction kernels */
@@@ -197,30 -197,30 +197,30 @@@
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
@@@ -229,36 -229,36 +229,36 @@@
   
   /* Tabulated Ewald interaction kernels with twin-range cut-off */
   #define EL_EWALD_TAB
- #define LJ_CUTOFF_CHECK
+ #define VDW_CUTOFF_CHECK
   
   /* cut-off + V shift LJ */
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w geometric combination rules */
   #define LJ_EWALD_COMB_GEOM
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_GEOM
   #undef NB_KERNEL_FUNC_NAME
   /* LJ-Ewald w LB combination rules */
   #define LJ_EWALD_COMB_LB
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_EWALD_COMB_LB
   #undef NB_KERNEL_FUNC_NAME
   /* F switch LJ */
   #define LJ_FORCE_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_FORCE_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   /* V switch LJ */
   #define LJ_POT_SWITCH
   #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
- -#include "nbnxn_cuda_kernel.cuh"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
   #undef LJ_POT_SWITCH
   #undef NB_KERNEL_FUNC_NAME
   
   #undef EL_EWALD_TAB
- #undef LJ_CUTOFF_CHECK
+ #undef VDW_CUTOFF_CHECK
diff --combined src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre

index 9432b2f3119546932bf371d2805ec4e9343740e6,2aa4fa9fb59bf5db2dab19d027552d4707556867..5e410cdecc4525f96fa7ab74c4a50c848b203861
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
--- 2/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre
@@@ -32,12 -32,12 +32,12 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include "typedefs.h"
+ +#include "config.h"
   
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
   #include "gromacs/mdlib/nbnxn_simd.h"
   
   #ifdef {0}
@@@ -47,11 -47,10 +47,11 @@@
   {2}
   #define GMX_SIMD_J_UNROLL_SIZE {3}
   #include "{4}"
- -#include "../nbnxn_kernel_common.h"
- -#include "gmx_omp_nthreads.h"
- -#include "types/force_flags.h"
- -#include "gmx_fatal.h"
+ +
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/types/force_flags.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
    */
@@@ -118,7 -117,7 +118,7 @@@ reduce_group_energies(int ng, int ng_2l
   
   #else /* {0} */
   
- -#include "gmx_fatal.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   #endif /* {0} */
   
@@@ -139,6 -138,7 +139,7 @@@ voi
       nbnxn_pairlist_t **nbl;
       int                coulkt, vdwkt = 0;
       int                nb;
+     int                nthreads gmx_unused;
   
       nnbl = nbl_list->nnbl;
       nbl  = nbl_list->nbl;
@@@ -210,7 -210,8 +211,8 @@@
           gmx_incons("Unsupported VdW interaction type");
       }}
   
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
       for (nb = 0; nb < nnbl; nb++)
       {{
           nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c

index 3db03bbec59c7f76f080ef9322b77f3cf2ac3cdc,a10465296569d932b4b90a2b34ca131d98c2fe58..4cad2a2f5637ba5b410af8db8ca1225cc441d1a6
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
--- 2/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
@@@ -32,24 -32,21 +32,24 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "nbnxn_kernel_ref.h"
+ +
+ +#include "config.h"
   
- -#include <math.h>
   #include <assert.h>
+ +#include <math.h>
   
- -#include "typedefs.h"
- -#include "vec.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_consts.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
+ +#include "gromacs/pbcutil/ishift.h"
   #include "gromacs/utility/smalloc.h"
- -#include "force.h"
- -#include "gmx_omp_nthreads.h"
- -#include "nbnxn_kernel_ref.h"
- -#include "../nbnxn_consts.h"
- -#include "nbnxn_kernel_common.h"
   
   /*! \brief Typedefs for declaring lookup tables of kernel functions.
    */
@@@ -73,21 -70,21 +73,21 @@@ typedef void (*p_nbk_func_ener)(const n
   /* Analytical reaction-field kernels */
   #define CALC_COUL_RF
   #define LJ_CUT
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_CUT
   #define LJ_FORCE_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_FORCE_SWITCH
   #define LJ_POT_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_POT_SWITCH
   #define LJ_EWALD
   #define LJ_CUT
   #define LJ_EWALD_COMB_GEOM
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_GEOM
   #define LJ_EWALD_COMB_LB
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_LB
   #undef LJ_CUT
   #undef LJ_EWALD
@@@ -97,42 -94,42 +97,42 @@@
   /* Tabulated exclusion interaction electrostatics kernels */
   #define CALC_COUL_TAB
   #define LJ_CUT
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_CUT
   #define LJ_FORCE_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_FORCE_SWITCH
   #define LJ_POT_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_POT_SWITCH
   #define LJ_EWALD
   #define LJ_CUT
   #define LJ_EWALD_COMB_GEOM
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_GEOM
   #define LJ_EWALD_COMB_LB
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_LB
   #undef LJ_CUT
   #undef LJ_EWALD
   /* Twin-range cut-off kernels */
   #define VDW_CUTOFF_CHECK
   #define LJ_CUT
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_CUT
   #define LJ_FORCE_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_FORCE_SWITCH
   #define LJ_POT_SWITCH
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_POT_SWITCH
   #define LJ_EWALD
   #define LJ_CUT
   #define LJ_EWALD_COMB_GEOM
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_GEOM
   #define LJ_EWALD_COMB_LB
- -#include "nbnxn_kernel_ref_includes.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_includes.h"
   #undef LJ_EWALD_COMB_LB
   #undef LJ_CUT
   #undef LJ_EWALD
@@@ -185,6 -182,7 +185,7 @@@ nbnxn_kernel_ref(const nbnxn_pairlist_s
       int                coult;
       int                vdwt;
       int                nb;
+     int                nthreads gmx_unused;
   
       nnbl = nbl_list->nnbl;
       nbl  = nbl_list->nbl;
@@@ -242,7 -240,8 +243,8 @@@
           gmx_incons("Unsupported vdwtype in nbnxn reference kernel");
       }
   
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
       for (nb = 0; nb < nnbl; nb++)
       {
           nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c

index ed1456873bcf373c6844295b7b286eb79e8a5577,35181d43eb17c584fa2af44b62f20aef15443875..1d8567b206d4619eab232683f732d56f20b71010
--- 1/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
--- 2/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
@@@ -37,12 -37,12 +37,12 @@@
    * kernel type 2xnn.
    */
   
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include "typedefs.h"
+ +#include "config.h"
   
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
   #include "gromacs/mdlib/nbnxn_simd.h"
   
   #ifdef GMX_NBNXN_SIMD_2XNN
@@@ -56,11 -56,10 +56,11 @@@
   
   #define GMX_SIMD_J_UNROLL_SIZE 2
   #include "nbnxn_kernel_simd_2xnn.h"
- -#include "../nbnxn_kernel_common.h"
- -#include "gmx_omp_nthreads.h"
- -#include "types/force_flags.h"
- -#include "gmx_fatal.h"
+ +
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/types/force_flags.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
    */
@@@ -253,7 -252,7 +253,7 @@@ reduce_group_energies(int ng, int ng_2l
   
   #else /* GMX_NBNXN_SIMD_2XNN */
   
- -#include "gmx_fatal.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   #endif /* GMX_NBNXN_SIMD_2XNN */
   
@@@ -274,6 -273,7 +274,7 @@@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_s
       nbnxn_pairlist_t **nbl;
       int                coulkt, vdwkt = 0;
       int                nb;
+     int                nthreads gmx_unused;
   
       nnbl = nbl_list->nnbl;
       nbl  = nbl_list->nbl;
@@@ -345,7 -345,8 +346,8 @@@
           gmx_incons("Unsupported VdW interaction type");
       }
   
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
       for (nb = 0; nb < nnbl; nb++)
       {
           nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c

index d17ba1e7bdbb1c328aa810bf3e268ed1c98d62fb,5974873a070147da2fd1723c8909d00449eb3dfe..3531299efd5feeda4bf72de911292bee47805745
--- 1/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
--- 2/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
@@@ -37,12 -37,12 +37,12 @@@
    * kernel type 4xn.
    */
   
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include "typedefs.h"
+ +#include "config.h"
   
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
   #include "gromacs/mdlib/nbnxn_simd.h"
   
   #ifdef GMX_NBNXN_SIMD_4XN
@@@ -55,11 -55,10 +55,11 @@@
   
   #define GMX_SIMD_J_UNROLL_SIZE 1
   #include "nbnxn_kernel_simd_4xn.h"
- -#include "../nbnxn_kernel_common.h"
- -#include "gmx_omp_nthreads.h"
- -#include "types/force_flags.h"
- -#include "gmx_fatal.h"
+ +
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/types/force_flags.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
    */
@@@ -252,7 -251,7 +252,7 @@@ reduce_group_energies(int ng, int ng_2l
   
   #else /* GMX_NBNXN_SIMD_4XN */
   
- -#include "gmx_fatal.h"
+ +#include "gromacs/utility/fatalerror.h"
   
   #endif /* GMX_NBNXN_SIMD_4XN */
   
@@@ -273,6 -272,7 +273,7 @@@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_se
       nbnxn_pairlist_t **nbl;
       int                coulkt, vdwkt = 0;
       int                nb;
+     int                nthreads gmx_unused;
   
       nnbl = nbl_list->nnbl;
       nbl  = nbl_list->nbl;
@@@ -344,7 -344,8 +345,8 @@@
           gmx_incons("Unsupported VdW interaction type");
       }
   
- #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+     nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ #pragma omp parallel for schedule(static) num_threads(nthreads)
       for (nb = 0; nb < nnbl; nb++)
       {
           nbnxn_atomdata_output_t *out;
diff --combined src/gromacs/mdlib/nbnxn_search.c

index f13de8c7db485633350dd28414c1d23bb53eb237,3114bebc22b048559369dede69c9774cf4d8f21a..56e616967c3f91d482408d2e4806c5f1f4fb4c37
--- 1/src/gromacs/mdlib/nbnxn_search.c
--- 2/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@@ -33,35 -33,34 +33,35 @@@
    * the research papers on the package. Check out http://www.gromacs.org.
    */
   
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "nbnxn_search.h"
+ +
+ +#include "config.h"
   
+ +#include <assert.h>
   #include <math.h>
   #include <string.h>
- -#include <assert.h>
   
- -#include "sysstuff.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "types/commrec.h"
- -#include "macros.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
   #include "gromacs/math/utilities.h"
- -#include "vec.h"
- -#include "pbc.h"
- -#include "nbnxn_consts.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_atomdata.h"
+ +#include "gromacs/mdlib/nbnxn_consts.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
   /* nbnxn_internal.h included gromacs/simd/macros.h */
- -#include "nbnxn_internal.h"
- -#ifdef GMX_NBNXN_SIMD
+ +#include "gromacs/mdlib/nbnxn_internal.h"
+ +#ifdef GMX_SIMD
   #include "gromacs/simd/vector_operations.h"
   #endif
- -#include "nbnxn_atomdata.h"
- -#include "nbnxn_search.h"
- -#include "gmx_omp_nthreads.h"
- -#include "nrnb.h"
- -#include "ns.h"
- -
- -#include "gromacs/fileio/gmxfio.h"
   
   #ifdef NBNXN_SEARCH_BB_SIMD4
   /* Always use 4-wide SIMD for bounding box calculations */
@@@ -1383,13 -1382,14 +1383,13 @@@ static void sort_columns_supersub(cons
                                     int cxy_start, int cxy_end,
                                     int *sort_work)
   {
- -    int  cxy;
- -    int  cx, cy, cz = -1, c = -1, ncz;
- -    int  na, ash, na_c, ind, a;
- -    int  subdiv_z, sub_z, na_z, ash_z;
- -    int  subdiv_y, sub_y, na_y, ash_y;
- -    int  subdiv_x, sub_x, na_x, ash_x;
+ +    int        cxy;
+ +    int        cx, cy, cz = -1, c = -1, ncz;
+ +    int        na, ash, na_c, ind, a;
+ +    int        subdiv_z, sub_z, na_z, ash_z;
+ +    int        subdiv_y, sub_y, na_y, ash_y;
+ +    int        subdiv_x, sub_x, na_x, ash_x;
   
- -    /* cppcheck-suppress unassignedVariable */
       nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
   
       bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
@@@ -1931,6 -1931,7 +1931,7 @@@ void nbnxn_grid_add_simple(nbnxn_search
       float        *bbcz;
       nbnxn_bb_t   *bb;
       int           ncd, sc;
+     int           nthreads gmx_unused;
   
       grid = &nbs->grid[0];
   
@@@ -1957,7 -1958,8 +1958,8 @@@
       bbcz = grid->bbcz_simple;
       bb   = grid->bb_simple;
   
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntPairsearch);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
       for (sc = 0; sc < grid->nc; sc++)
       {
           int c, tx, na;
@@@ -2942,10 -2944,10 +2944,10 @@@ static void make_cluster_list_simple(co
   }
   
   #ifdef GMX_NBNXN_SIMD_4XN
- -#include "nbnxn_search_simd_4xn.h"
+ +#include "gromacs/mdlib/nbnxn_search_simd_4xn.h"
   #endif
   #ifdef GMX_NBNXN_SIMD_2XNN
- -#include "nbnxn_search_simd_2xnn.h"
+ +#include "gromacs/mdlib/nbnxn_search_simd_2xnn.h"
   #endif
   
   /* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
@@@ -4473,6 -4475,7 +4475,7 @@@ static void combine_nblists(int nnbl, n
   {
       int nsci, ncj4, nexcl;
       int n, i;
+     int nthreads gmx_unused;
   
       if (nblc->bSimple)
       {
@@@ -4513,7 -4516,8 +4516,8 @@@
       /* Each thread should copy its own data to the combined arrays,
        * as otherwise data will go back and forth between different caches.
        */
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+     nthreads = gmx_omp_nthreads_get(emntPairsearch);
+ #pragma omp parallel for num_threads(nthreads) schedule(static)
       for (n = 0; n < nnbl; n++)
       {
           int                     sci_offset;
diff --combined src/gromacs/mdlib/ns.c

index 9145929c475b11a0d5bda33347479803215ca818,430e26d304780b5b3f828efdad076163127cff1d..3a1396cd40ac57bec8a893dedd78d758d470e6dc
--- 1/src/gromacs/mdlib/ns.c
--- 2/src/gromacs/mdlib/ns.c
+++ b/src/gromacs/mdlib/ns.c
@@@ -34,33 -34,33 +34,33 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "gromacs/legacyheaders/ns.h"
   
   #include <math.h>
+ +#include <stdlib.h>
   #include <string.h>
- -#include "sysstuff.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "macros.h"
+ +
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/nsgrid.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
   #include "gromacs/math/utilities.h"
- -#include "vec.h"
- -#include "types/commrec.h"
- -#include "network.h"
- -#include "nsgrid.h"
- -#include "force.h"
- -#include "nonbonded.h"
- -#include "ns.h"
- -#include "pbc.h"
- -#include "names.h"
- -#include "gmx_fatal.h"
- -#include "nrnb.h"
- -#include "txtdump.h"
- -#include "mtop_util.h"
- -
- -#include "domdec.h"
- -#include "adress.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
+ +#include "adress.h"
   
   /*
    *    E X C L U S I O N   H A N D L I N G
@@@ -287,8 -287,11 +287,8 @@@ void init_neighbor_list(FILE *log, t_fo
           /* Did we get the solvent loops so we can use optimized water kernels? */
           if (nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf == NULL
               || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf == NULL
- -#ifndef DISABLE_WATERWATER_NLIST
               || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf == NULL
- -            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL
- -#endif
- -            )
+ +            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL)
           {
               fr->solvent_opt = esolNO;
               if (log != NULL)
@@@ -427,6 -430,14 +427,14 @@@ static gmx_inline void close_i_nblist(t
           nlist->jindex[nri+1] = nlist->nrj;
   
           len = nlist->nrj -  nlist->jindex[nri];
+         /* If there are no j-particles we have to reduce the
+          * number of i-particles again, to prevent errors in the
+          * kernel functions.
+          */
+         if ((len == 0) && (nlist->nri > 0))
+         {
+             nlist->nri--;
+         }
       }
   }
   
@@@ -661,11 -672,13 +669,11 @@@ put_in_list_at(gmx_bool              bH
   
       if (iwater != esolNO)
       {
- -        vdwc = &nlist[eNL_VDWQQ_WATER];
- -        vdw  = &nlist[eNL_VDW];
- -        coul = &nlist[eNL_QQ_WATER];
- -#ifndef DISABLE_WATERWATER_NLIST
+ +        vdwc    = &nlist[eNL_VDWQQ_WATER];
+ +        vdw     = &nlist[eNL_VDW];
+ +        coul    = &nlist[eNL_QQ_WATER];
           vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
           coul_ww = &nlist[eNL_QQ_WATERWATER];
- -#endif
       }
       else
       {
@@@ -685,7 -698,9 +693,7 @@@
               if (bDoCoul && bDoVdW)
               {
                   new_i_nblist(vdwc, i_atom, shift, gid);
- -#ifndef DISABLE_WATERWATER_NLIST
                   new_i_nblist(vdwc_ww, i_atom, shift, gid);
- -#endif
               }
               if (bDoVdW)
               {
@@@ -694,7 -709,9 +702,7 @@@
               if (bDoCoul)
               {
                   new_i_nblist(coul, i_atom, shift, gid);
- -#ifndef DISABLE_WATERWATER_NLIST
                   new_i_nblist(coul_ww, i_atom, shift, gid);
- -#endif
               }
               /* Loop over the j charge groups */
               for (j = 0; (j < nj); j++)
@@@ -719,6 -736,19 +727,6 @@@
                       }
                       else
                       {
- -#ifdef DISABLE_WATERWATER_NLIST
- -                        /* Add entries for the three atoms - only do VdW if we need to */
- -                        if (!bDoVdW)
- -                        {
- -                            add_j_to_nblist(coul, jj0, bLR);
- -                        }
- -                        else
- -                        {
- -                            add_j_to_nblist(vdwc, jj0, bLR);
- -                        }
- -                        add_j_to_nblist(coul, jj0+1, bLR);
- -                        add_j_to_nblist(coul, jj0+2, bLR);
- -#else
                           /* One entry for the entire water-water interaction */
                           if (!bDoVdW)
                           {
@@@ -728,6 -758,7 +736,6 @@@
                           {
                               add_j_to_nblist(vdwc_ww, jj0, bLR);
                           }
- -#endif
                       }
                   }
                   else if (iwater == esolTIP4P && jwater == esolTIP4P)
@@@ -740,6 -771,16 +748,6 @@@
                       }
                       else
                       {
- -#ifdef DISABLE_WATERWATER_NLIST
- -                        /* Add entries for the four atoms - only do VdW if we need to */
- -                        if (bDoVdW)
- -                        {
- -                            add_j_to_nblist(vdw, jj0, bLR);
- -                        }
- -                        add_j_to_nblist(coul, jj0+1, bLR);
- -                        add_j_to_nblist(coul, jj0+2, bLR);
- -                        add_j_to_nblist(coul, jj0+3, bLR);
- -#else
                           /* One entry for the entire water-water interaction */
                           if (!bDoVdW)
                           {
@@@ -749,6 -790,7 +757,6 @@@
                           {
                               add_j_to_nblist(vdwc_ww, jj0, bLR);
                           }
- -#endif
                       }
                   }
                   else
@@@ -809,8 -851,10 +817,8 @@@
               close_i_nblist(vdw);
               close_i_nblist(coul);
               close_i_nblist(vdwc);
- -#ifndef DISABLE_WATERWATER_NLIST
               close_i_nblist(coul_ww);
               close_i_nblist(vdwc_ww);
- -#endif
           }
           else
           {
@@@ -2087,7 -2131,7 +2095,7 @@@ static int nsgrid_core(t_commrec *cr, t
       gmx_ns_t     *ns;
       atom_id     **nl_lr_ljc, **nl_lr_one, **nl_sr;
       int          *nlr_ljc, *nlr_one, *nsr;
- -    gmx_domdec_t *dd     = NULL;
+ +    gmx_domdec_t *dd;
       t_block      *cgs    = &(top->cgs);
       int          *cginfo = fr->cginfo;
       /* atom_id *i_atoms,*cgsindex=cgs->index; */
@@@ -2115,7 -2159,10 +2123,7 @@@
       ns = &fr->ns;
   
       bDomDec = DOMAINDECOMP(cr);
- -    if (bDomDec)
- -    {
- -        dd = cr->dd;
- -    }
+ +    dd      = cr->dd;
   
       bTriclinicX = ((YY < grid->npbcdim &&
                       (!bDomDec || dd->nc[YY] == 1) && box[YY][XX] != 0) ||
diff --combined src/gromacs/mdlib/sim_util.c

index e6686abd02d1ca3ea154f0670dafc479b2724554,a6b166ccfd5acedb51742f9063d40207edda4e23..b35dd827ac0afe009787c7e562d0baebeb264322
--- 1/src/gromacs/mdlib/sim_util.c
--- 2/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@@ -34,71 -34,70 +34,71 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "gromacs/legacyheaders/sim_util.h"
+ +
+ +#include "config.h"
   
   #include <assert.h>
   #include <math.h>
   #include <stdio.h>
   #include <string.h>
+ +
   #ifdef HAVE_SYS_TIME_H
   #include <sys/time.h>
   #endif
   
- -#include "typedefs.h"
- -#include "gromacs/utility/cstringutil.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "names.h"
- -#include "txtdump.h"
- -#include "pbc.h"
- -#include "chargegroup.h"
- -#include "vec.h"
- -#include "nrnb.h"
- -#include "mshift.h"
- -#include "mdrun.h"
- -#include "sim_util.h"
- -#include "update.h"
- -#include "physics.h"
- -#include "main.h"
- -#include "mdatoms.h"
- -#include "force.h"
- -#include "bondf.h"
- -#include "pme.h"
- -#include "disre.h"
- -#include "orires.h"
- -#include "network.h"
- -#include "calcmu.h"
- -#include "constr.h"
- -#include "xvgr.h"
- -#include "copyrite.h"
- -#include "domdec.h"
- -#include "genborn.h"
- -#include "nbnxn_atomdata.h"
- -#include "nbnxn_search.h"
- -#include "nbnxn_kernels/nbnxn_kernel_ref.h"
- -#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
- -#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
- -#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
- -#include "nonbonded.h"
- -#include "../gmxlib/nonbonded/nb_kernel.h"
- -#include "../gmxlib/nonbonded/nb_free_energy.h"
- -
- -#include "gromacs/timing/wallcycle.h"
- -#include "gromacs/timing/walltime_accounting.h"
- -#include "gromacs/utility/gmxmpi.h"
   #include "gromacs/essentialdynamics/edsam.h"
+ +#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ +#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+ +#include "gromacs/imd/imd.h"
+ +#include "gromacs/legacyheaders/calcmu.h"
+ +#include "gromacs/legacyheaders/chargegroup.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/copyrite.h"
+ +#include "gromacs/legacyheaders/disre.h"
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/genborn.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/orires.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/update.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/listed-forces/bonded.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_atomdata.h"
+ +#include "gromacs/mdlib/nbnxn_search.h"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/pulling/pull.h"
   #include "gromacs/pulling/pull_rotation.h"
- -#include "gromacs/imd/imd.h"
- -#include "adress.h"
- -#include "qmmm.h"
- -
- -#include "gmx_omp_nthreads.h"
+ +#include "gromacs/timing/wallcycle.h"
+ +#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/gmxmpi.h"
+ +#include "gromacs/utility/smalloc.h"
   
- -#include "nbnxn_cuda_data_mgmt.h"
- -#include "nbnxn_cuda/nbnxn_cuda.h"
+ +#include "adress.h"
   
   void print_time(FILE                     *out,
                   gmx_walltime_accounting_t walltime_accounting,
@@@ -314,7 -313,9 +314,7 @@@ static void calc_virial(int start, int 
       }
   }
   
- -static void posres_wrapper(FILE *fplog,
- -                           int flags,
- -                           gmx_bool bSepDVDL,
+ +static void posres_wrapper(int flags,
                              t_inputrec *ir,
                              t_nrnb *nrnb,
                              gmx_localtop_t *top,
@@@ -336,6 -337,10 +336,6 @@@
                     ir->ePBC == epbcNONE ? NULL : &pbc,
                     lambda[efptRESTRAINT], &dvdl,
                     fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
- -    }
       enerd->term[F_POSRES] += v;
       /* If just the force constant changes, the FEP term is linear,
        * but if k changes, it is not.
@@@ -381,7 -386,9 +381,7 @@@ static void fbposres_wrapper(t_inputre
       inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
   }
   
- -static void pull_potential_wrapper(FILE *fplog,
- -                                   gmx_bool bSepDVDL,
- -                                   t_commrec *cr,
+ +static void pull_potential_wrapper(t_commrec *cr,
                                      t_inputrec *ir,
                                      matrix box, rvec x[],
                                      rvec f[],
@@@ -406,11 -413,17 +406,11 @@@
       enerd->term[F_COM_PULL] +=
           pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
                          cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
- -    }
       enerd->dvdl_lin[efptRESTRAINT] += dvdl;
       wallcycle_stop(wcycle, ewcPULLPOT);
   }
   
- -static void pme_receive_force_ener(FILE           *fplog,
- -                                   gmx_bool        bSepDVDL,
- -                                   t_commrec      *cr,
+ +static void pme_receive_force_ener(t_commrec      *cr,
                                      gmx_wallcycle_t wcycle,
                                      gmx_enerdata_t *enerd,
                                      t_forcerec     *fr)
@@@ -430,6 -443,11 +430,6 @@@
       gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
                         fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                         &cycles_seppme);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, "Electrostatic PME mesh", e_q, dvdl_q);
- -        gmx_print_sepdvdl(fplog, "Lennard-Jones PME mesh", e_lj, dvdl_lj);
- -    }
       enerd->term[F_COUL_RECIP] += e_q;
       enerd->term[F_LJ_RECIP]   += e_lj;
       enerd->dvdl_lin[efptCOUL] += dvdl_q;
@@@ -784,11 -802,6 +784,11 @@@ static void do_nb_verlet_fep(nbnxn_pair
       wallcycle_sub_stop(wcycle, ewcsNONBONDED);
   }
   
+ +gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
+ +{
+ +    return nbv != NULL && nbv->bUseGPU;
+ +}
+ +
   void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                            t_inputrec *inputrec,
                            gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
@@@ -810,7 -823,7 +810,7 @@@
       int                 start, homenr;
       int                 nb_kernel_type;
       double              mu[2*DIM];
- -    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
       gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
       gmx_bool            bDiffKernels = FALSE;
       matrix              boxs;
@@@ -827,6 -840,8 +827,6 @@@
       start  = 0;
       homenr = mdatoms->homenr;
   
- -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
- -
       clear_mat(vir_force);
   
       cg0 = 0;
@@@ -1236,11 -1251,10 +1236,11 @@@
       }
   
       /* We calculate the non-bonded forces, when done on the CPU, here.
- -     * We do this before calling do_force_lowlevel, as in there bondeds
- -     * forces are calculated before PME, which does communication.
- -     * With this order, non-bonded and bonded force calculation imbalance
- -     * can be balanced out by the domain decomposition load balancing.
+ +     * We do this before calling do_force_lowlevel, because in that
+ +     * function, the listed forces are calculated before PME, which
+ +     * does communication.  With this order, non-bonded and listed
+ +     * force calculation imbalance can be balanced out by the domain
+ +     * decomposition load balancing.
        */
   
       if (!bUseOrEmulGPU)
@@@ -1320,22 -1334,22 +1320,22 @@@
           update_QMMMrec(cr, fr, x, mdatoms, box, top);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
       {
- -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                          enerd, lambda, fr);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
       {
           fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
       }
   
       /* Compute the bonded and non-bonded energies and optionally forces */
- -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +    do_force_lowlevel(fr, inputrec, &(top->idef),
                         cr, nrnb, wcycle, mdatoms,
                         x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- -                      &(top->atomtypes), bBornRadii, box,
+ +                      bBornRadii, box,
                         inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                         flags, &cycles_pme);
   
@@@ -1399,6 -1413,16 +1399,16 @@@
   
       if (bDoForces && DOMAINDECOMP(cr))
       {
+         if (bUseGPU)
+         {
+             /* We are done with the CPU compute, but the GPU local non-bonded
+              * kernel can still be running while we communicate the forces.
+              * We start a counter here, so we can, hopefully, time the rest
+              * of the GPU kernel execution and data transfer.
+              */
+             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+         }
+ 
           /* Communicate the forces */
           wallcycle_start(wcycle, ewcMOVEF);
           dd_move_f(cr->dd, f, fr->fshift);
@@@ -1429,13 -1453,44 +1439,44 @@@
           /* wait for local forces (or calculate in emulation mode) */
           if (bUseGPU)
           {
+             float       cycles_tmp, cycles_wait_est;
+             const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+ 
               wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
               nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                   nbv->grp[eintLocal].nbat,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
-             cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+             cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ 
+             if (bDoForces && DOMAINDECOMP(cr))
+             {
+                 cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+ 
+                 if (cycles_tmp < cuda_api_overhead_margin)
+                 {
+                     /* We measured few cycles, it could be that the kernel
+                      * and transfer finished earlier and there was no actual
+                      * wait time, only API call overhead.
+                      * Then the actual time could be anywhere between 0 and
+                      * cycles_wait_est. As a compromise, we use half the time.
+                      */
+                     cycles_wait_est *= 0.5f;
+                 }
+             }
+             else
+             {
+                 /* No force communication so we actually timed the wait */
+                 cycles_wait_est = cycles_tmp;
+             }
+             /* Even though this is after dd_move_f, the actual task we are
+              * waiting for runs asynchronously with dd_move_f and we usually
+              * have nothing to balance it with, so we can and should add
+              * the time to the force time for load balancing.
+              */
+             cycles_force    += cycles_wait_est;
+             cycles_wait_gpu += cycles_wait_est;
   
               /* now clear the GPU outputs while we finish the step on the CPU */
   
@@@ -1519,7 -1574,7 +1560,7 @@@
           /* Since the COM pulling is always done mass-weighted, no forces are
            * applied to vsites and this call can be done after vsite spreading.
            */
- -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +        pull_potential_wrapper(cr, inputrec, box, x,
                                  f, vir_force, mdatoms, enerd, lambda, t,
                                  wcycle);
       }
@@@ -1540,7 -1595,7 +1581,7 @@@
           /* In case of node-splitting, the PP nodes receive the long-range
            * forces, virial and energy from the PME nodes here.
            */
- -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +        pme_receive_force_ener(cr, wcycle, enerd, fr);
       }
   
       if (bDoForces)
@@@ -1573,7 -1628,7 +1614,7 @@@ void do_force_cutsGROUP(FILE *fplog, t_
       int        cg0, cg1, i, j;
       int        start, homenr;
       double     mu[2*DIM];
- -    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
       gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
       gmx_bool   bDoAdressWF;
       matrix     boxs;
@@@ -1585,6 -1640,8 +1626,6 @@@
       start  = 0;
       homenr = mdatoms->homenr;
   
- -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
- -
       clear_mat(vir_force);
   
       cg0 = 0;
@@@ -1869,22 -1926,22 +1910,22 @@@
           update_QMMMrec(cr, fr, x, mdatoms, box, top);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
       {
- -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                          enerd, lambda, fr);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ +    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
       {
           fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
       }
   
       /* Compute the bonded and non-bonded energies and optionally forces */
- -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +    do_force_lowlevel(fr, inputrec, &(top->idef),
                         cr, nrnb, wcycle, mdatoms,
                         x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- -                      &(top->atomtypes), bBornRadii, box,
+ +                      bBornRadii, box,
                         inputrec->fepvals, lambda,
                         graph, &(top->excls), fr->mu_tot,
                         flags,
@@@ -1992,7 -2049,7 +2033,7 @@@
   
       if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
       {
- -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +        pull_potential_wrapper(cr, inputrec, box, x,
                                  f, vir_force, mdatoms, enerd, lambda, t,
                                  wcycle);
       }
@@@ -2013,7 -2070,7 +2054,7 @@@
           /* In case of node-splitting, the PP nodes receive the long-range
            * forces, virial and energy from the PME nodes here.
            */
- -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +        pme_receive_force_ener(cr, wcycle, enerd, fr);
       }
   
       if (bDoForces)
@@@ -2429,8 -2486,8 +2470,8 @@@ void calc_enervirdiff(FILE *fplog, int 
       }
   }
   
- -void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
- -                   gmx_int64_t step, int natoms,
+ +void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
+ +                   int natoms,
                      matrix box, real lambda, tensor pres, tensor virial,
                      real *prescorr, real *enercorr, real *dvdlcorr)
   {
@@@ -2531,6 -2588,10 +2572,6 @@@
               }
           }
   
- -        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
- -        {
- -            gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
- -        }
           if (fr->efep != efepNO)
           {
               *dvdlcorr += dvdlambda;
@@@ -2635,7 -2696,7 +2676,7 @@@ void finish_run(FILE *fplog, t_commrec 
                   t_inputrec *inputrec,
                   t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                   gmx_walltime_accounting_t walltime_accounting,
- -                wallclock_gpu_t *gputimes,
+ +                nonbonded_verlet_t *nbv,
                   gmx_bool bWriteStat)
   {
       int     i, j;
@@@ -2699,8 -2760,6 +2740,8 @@@
   
       if (SIMMASTER(cr))
       {
+ +        wallclock_gpu_t* gputimes = use_GPU(nbv) ?
+ +            nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
           wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                           elapsed_time_over_all_ranks,
                           wcycle, gputimes);
diff --combined src/gromacs/mdlib/update.c

index 3be84a8a93c697047beff45cb75b6ffa8d861312,93b7f59c1ec4dc7faffce100b5b1f1f51920c478..373212ca8a07a46f49551430fd3b6d5c1d365e6f
--- 1/src/gromacs/mdlib/update.c
--- 2/src/gromacs/mdlib/update.c
+++ b/src/gromacs/mdlib/update.c
@@@ -34,37 -34,41 +34,37 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
+ +#include "gromacs/legacyheaders/update.h"
   
- -#include <stdio.h>
   #include <math.h>
- -
- -#include "types/commrec.h"
- -#include "sysstuff.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "typedefs.h"
- -#include "nrnb.h"
- -#include "physics.h"
- -#include "macros.h"
- -#include "vec.h"
- -#include "main.h"
- -#include "update.h"
- -#include "gromacs/random/random.h"
- -#include "mshift.h"
- -#include "tgroup.h"
- -#include "force.h"
- -#include "names.h"
- -#include "txtdump.h"
- -#include "mdrun.h"
- -#include "constr.h"
- -#include "disre.h"
- -#include "orires.h"
- -#include "gmx_omp_nthreads.h"
+ +#include <stdio.h>
   
   #include "gromacs/fileio/confio.h"
- -#include "gromacs/fileio/futil.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/disre.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/orires.h"
+ +#include "gromacs/legacyheaders/tgroup.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/pulling/pull.h"
+ +#include "gromacs/random/random.h"
   #include "gromacs/timing/wallcycle.h"
+ +#include "gromacs/utility/futil.h"
   #include "gromacs/utility/gmxomp.h"
- -#include "gromacs/pulling/pull.h"
+ +#include "gromacs/utility/smalloc.h"
   
   /*For debugging, start at v(-dt/2) for velolcity verlet -- uncomment next line */
   /*#define STARTFROMDT2*/
@@@ -1767,7 -1771,9 +1767,9 @@@ void update_constraints(FIL
           }
           else
           {
- #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+             nth = gmx_omp_nthreads_get(emntUpdate);
+ 
+ #pragma omp parallel for num_threads(nth) schedule(static)
               for (i = start; i < nrend; i++)
               {
                   copy_rvec(upd->xp[i], state->x[i]);
diff --combined src/gromacs/timing/wallcycle.c

index 27eb275cd0e088e521f9d693ae731a6f188980c5,ae26c2cec4c0492ba6cdf92071587029d90d4709..3b0d90663631b29e7e15b6844e771ef653781a02
--- 1/src/gromacs/timing/wallcycle.c
--- 2/src/gromacs/timing/wallcycle.c
+++ b/src/gromacs/timing/wallcycle.c
@@@ -34,22 -34,21 +34,22 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#include "gromacs/timing/wallcycle.h"
+ +#include "gmxpre.h"
   
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "wallcycle.h"
   
- -#include <string.h>
+ +#include "config.h"
   
- -#include "gromacs/utility/smalloc.h"
- -#include "gmx_fatal.h"
- -#include "md_logging.h"
- -#include "gromacs/utility/cstringutil.h"
+ +#include <stdlib.h>
+ +#include <string.h>
   
+ +#include "gromacs/legacyheaders/md_logging.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
   #include "gromacs/timing/cyclecounter.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
   #include "gromacs/utility/gmxmpi.h"
+ +#include "gromacs/utility/smalloc.h"
   
   /* DEBUG_WCYCLE adds consistency checking for the counters.
    * It checks if you stop a counter different from the last
@@@ -99,7 -98,7 +99,7 @@@ static const char *wcn[ewcNR] 
       "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
       "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
       "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
-     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
       "Vsite spread", "COM pull force",
       "Write traj.", "Update", "Constraints", "Comm. energies",
       "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
@@@ -110,7 -109,7 +110,7 @@@ static const char *wcsn[ewcsNR] 
       "DD redist.", "DD NS grid + sort", "DD setup comm.",
       "DD make top.", "DD make constr.", "DD top. other",
       "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
- -    "Bonded F", "Nonbonded F", "Ewald F correction",
+ +    "Listed F", "Nonbonded F", "Ewald F correction",
       "NB X buffer ops.", "NB F buffer ops."
   };
   
@@@ -388,6 -387,12 +388,12 @@@ void wallcycle_sum(t_commrec *cr, gmx_w
   
       wcc = wc->wcc;
   
+     /* The GPU wait estimate counter is used for load balancing only
+      * and will mess up the total due to double counting: clear it.
+      */
+     wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+     wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+ 
       for (i = 0; i < ewcNR; i++)
       {
           if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
diff --combined src/gromacs/timing/wallcycle.h

index 104173598a16ab5270179208f1f58a793c8f9295,7b48849707ca4b3ab1d78e93f8b02c779decf5d7..69c92cde722ddb801b16c4bf8114515d8792bf46
--- 1/src/gromacs/timing/wallcycle.h
--- 2/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@@ -38,23 -38,19 +38,23 @@@
   #define GMX_TIMING_WALLCYCLE_H
   
   #include <stdio.h>
- -#include "gromacs/legacyheaders/typedefs.h"
- -#include "gromacs/legacyheaders/types/commrec.h"
+ +
+ +#include "gromacs/legacyheaders/types/commrec_fwd.h"
+ +#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+ +#include "gromacs/utility/basedefinitions.h"
   
   #ifdef __cplusplus
   extern "C" {
   #endif
   
+ +typedef struct gmx_wallcycle *gmx_wallcycle_t;
+ +
   enum {
       ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
       ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
       ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
       ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
-     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS,
       ewcVSITESPREAD, ewcPULLPOT,
       ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
       ewcTEST, ewcNR
@@@ -66,7 -62,7 +66,7 @@@ enum 
       ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER,
       ewcsNBS_GRID_LOCAL, ewcsNBS_GRID_NONLOCAL,
       ewcsNBS_SEARCH_LOCAL, ewcsNBS_SEARCH_NONLOCAL,
- -    ewcsBONDED, ewcsNONBONDED, ewcsEWALD_CORRECTION,
+ +    ewcsLISTED, ewcsNONBONDED, ewcsEWALD_CORRECTION,
       ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS,
       ewcsNR
   };
diff --combined src/programs/mdrun/md.cpp

index 9864f8789f9385d2e536ed33f5988bacff85186e,3d98d597c7d0e9c009e02d816e631b3f36846a25..0504150c008e23e09337d2774f24662f049a6474
--- 1/src/programs/mdrun/md.cpp
--- 2/src/programs/mdrun/md.c
+++ b/src/programs/mdrun/md.cpp
@@@ -34,67 -34,67 +34,67 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include "typedefs.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "sysstuff.h"
- -#include "vec.h"
- -#include "vcm.h"
- -#include "mdebin.h"
- -#include "nrnb.h"
- -#include "calcmu.h"
- -#include "index.h"
- -#include "vsite.h"
- -#include "update.h"
- -#include "ns.h"
- -#include "mdrun.h"
- -#include "md_support.h"
- -#include "md_logging.h"
- -#include "network.h"
- -#include "xvgr.h"
- -#include "physics.h"
- -#include "names.h"
- -#include "force.h"
- -#include "disre.h"
- -#include "orires.h"
- -#include "pme.h"
- -#include "mdatoms.h"
- -#include "repl_ex.h"
- -#include "deform.h"
- -#include "qmmm.h"
- -#include "domdec.h"
- -#include "domdec_network.h"
- -#include "gromacs/gmxlib/topsort.h"
- -#include "coulomb.h"
- -#include "constr.h"
- -#include "shellfc.h"
- -#include "gromacs/gmxpreprocess/compute_io.h"
- -#include "checkpoint.h"
- -#include "mtop_util.h"
- -#include "sighandler.h"
- -#include "txtdump.h"
- -#include "gromacs/utility/cstringutil.h"
- -#include "pme_loadbal.h"
- -#include "bondf.h"
- -#include "membed.h"
- -#include "types/nlistheuristics.h"
- -#include "types/iteratedconstraints.h"
- -#include "nbnxn_cuda_data_mgmt.h"
+ +#include "config.h"
+ +
+ +#include <stdlib.h>
   
- -#include "gromacs/utility/gmxmpi.h"
   #include "gromacs/fileio/confio.h"
+ +#include "gromacs/fileio/mdoutf.h"
   #include "gromacs/fileio/trajectory_writing.h"
   #include "gromacs/fileio/trnio.h"
   #include "gromacs/fileio/trxio.h"
   #include "gromacs/fileio/xtcio.h"
- -#include "gromacs/timing/wallcycle.h"
- -#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/gmxpreprocess/compute_io.h"
+ +#include "gromacs/imd/imd.h"
+ +#include "gromacs/legacyheaders/bonded-threading.h"
+ +#include "gromacs/legacyheaders/calcmu.h"
+ +#include "gromacs/legacyheaders/checkpoint.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/coulomb.h"
+ +#include "gromacs/legacyheaders/disre.h"
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/domdec_network.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/md_logging.h"
+ +#include "gromacs/legacyheaders/md_support.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/mdebin.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/orires.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/shellfc.h"
+ +#include "gromacs/legacyheaders/sighandler.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/update.h"
+ +#include "gromacs/legacyheaders/vcm.h"
+ +#include "gromacs/legacyheaders/vsite.h"
+ +#include "gromacs/legacyheaders/types/iteratedconstraints.h"
+ +#include "gromacs/legacyheaders/types/nlistheuristics.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/pulling/pull.h"
   #include "gromacs/swap/swapcoords.h"
- -#include "gromacs/imd/imd.h"
+ +#include "gromacs/timing/wallcycle.h"
+ +#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/gmxmpi.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
+ +#include "deform.h"
+ +#include "membed.h"
+ +#include "pme_loadbal.h"
+ +#include "repl_ex.h"
   
   #ifdef GMX_FAHCORE
   #include "corewrap.h"
@@@ -105,7 -105,7 +105,7 @@@ static void reset_all_counters(FILE *fp
                                  gmx_int64_t *step_rel, t_inputrec *ir,
                                  gmx_wallcycle_t wcycle, t_nrnb *nrnb,
                                  gmx_walltime_accounting_t walltime_accounting,
- -                               nbnxn_cuda_ptr_t cu_nbv)
+ +                               struct nonbonded_verlet_t *nbv)
   {
       char sbuf[STEPSTRSIZE];
   
@@@ -113,7 -113,10 +113,7 @@@
       md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                     gmx_step_str(step, sbuf));
   
- -    if (cu_nbv)
- -    {
- -        nbnxn_cuda_reset_timings(cu_nbv);
- -    }
+ +    nbnxn_cuda_reset_timings(nbv);
   
       wallcycle_stop(wcycle, ewcRUN);
       wallcycle_reset_all(wcycle);
@@@ -167,7 -170,7 +167,7 @@@ double do_md(FILE *fplog, t_commrec *cr
       rvec              mu_tot;
       t_vcm            *vcm;
       t_state          *bufstate = NULL;
- -    matrix           *scale_tot, pcoupl_mu, M, ebox;
+ +    matrix            pcoupl_mu, M;
       gmx_nlheur_t      nlh;
       t_trxframe        rerun_fr;
       gmx_repl_ex_t     repl_ex = NULL;
@@@ -186,8 -189,10 +186,8 @@@
       gmx_ekindata_t   *ekind, *ekind_save;
       gmx_shellfc_t     shellfc;
       int               count, nconverged = 0;
- -    real              timestep   = 0;
- -    double            tcount     = 0;
- -    gmx_bool          bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
- -    gmx_bool          bAppend;
+ +    double            tcount                 = 0;
+ +    gmx_bool          bConverged             = TRUE, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
       gmx_bool          bResetCountersHalfMaxH = FALSE;
       gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
       gmx_bool          bUpdateDoLR;
@@@ -202,6 -207,7 +202,6 @@@
       double            cycles;
       real              saved_conserved_quantity = 0;
       real              last_ekin                = 0;
- -    int               iter_i;
       t_extmass         MassQ;
       int             **trotter_seq;
       char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
@@@ -225,6 -231,7 +225,6 @@@
   
       /* Check for special mdrun options */
       bRerunMD = (Flags & MD_RERUN);
- -    bAppend  = (Flags & MD_APPENDFILES);
       if (Flags & MD_RESETCOUNTERSHALFWAY)
       {
           if (ir->nsteps > 0)
@@@ -476,7 -483,7 +476,7 @@@
        */
       if ((Flags & MD_TUNEPME) &&
           EEL_PME(fr->eeltype) &&
- -        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ +        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
           !bRerunMD)
       {
           pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
@@@ -719,7 -726,9 +719,7 @@@
       bStateFromTPX    = !bStateFromCP;
       bInitStep        = bFirstStep && (bStateFromTPX || bVV);
       bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
- -    bLastStep        = FALSE;
       bSumEkinhOld     = FALSE;
- -    bDoReplEx        = FALSE;
       bExchanged       = FALSE;
       bNeedRepartition = FALSE;
   
@@@ -728,7 -737,10 +728,7 @@@
       step     = ir->init_step;
       step_rel = 0;
   
- -    if (ir->nstlist == -1)
- -    {
- -        init_nlistheuristics(&nlh, bGStatEveryStep, step);
- -    }
+ +    init_nlistheuristics(&nlh, bGStatEveryStep, step);
   
       if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
       {
@@@ -1149,6 -1161,7 +1149,6 @@@
                       }
                   }
   
- -                bOK = TRUE;
                   if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
                   {
                       wallcycle_stop(wcycle, ewcUPDATE);
@@@ -1164,6 -1177,12 +1164,6 @@@
                           /* Correct the virial for multiple time stepping */
                           m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
                       }
- -
- -                    if (!bOK)
- -                    {
- -                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
- -                    }
- -
                   }
                   else if (graph)
                   {
@@@ -1464,6 -1483,7 +1464,6 @@@
                */
               copy_mat(state->box, lastbox);
   
- -            bOK         = TRUE;
               dvdl_constr = 0;
   
               if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
@@@ -1573,7 -1593,15 +1573,7 @@@
                                          FALSE, bCalcVir,
                                          state->veta);
                   }
- -                if (!bOK)
- -                {
- -                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
- -                }
   
- -                if (fr->bSepDVDL && fplog && do_log)
- -                {
- -                    gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr);
- -                }
                   if (bVV)
                   {
                       /* this factor or 2 correction is necessary
@@@ -1881,6 -1909,21 +1881,21 @@@
                       }
                       dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
   
- -                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                     if (bPMETuneRunning &&
++                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+                         !(cr->duty & DUTY_PME))
+                     {
+                         /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                          * With GPUs + separate PME ranks, we don't want DLB.
+                          * This could happen when we scan coarse grids and
+                          * it would then never be turned off again.
+                          * This would hurt performance at the final, optimal
+                          * grid spacing, where DLB almost never helps.
+                          * Also, DLB can limit the cut-off for PME tuning.
+                          */
+                         dd_dlb_set_lock(cr->dd, TRUE);
+                     }
+ 
                       if (bPMETuneRunning || step_rel > ir->nstlist*50)
                       {
                           bPMETuneTry     = FALSE;
@@@ -1911,6 -1954,16 +1926,16 @@@
                       {
                           calc_enervirdiff(NULL, ir->eDispCorr, fr);
                       }
+ 
+                     if (!bPMETuneRunning &&
+                         DOMAINDECOMP(cr) &&
+                         dd_dlb_is_locked(cr->dd))
+                     {
+                         /* Unlock the DLB=auto, DLB is allowed to activate
+                          * (but we don't expect it to activate in most cases).
+                          */
+                         dd_dlb_set_lock(cr->dd, FALSE);
+                     }
                   }
                   cycles_pmes = 0;
               }
@@@ -1921,7 -1974,7 +1946,7 @@@
           {
               /* Reset all the counters related to performance over the run */
               reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
- -                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ +                               use_GPU(fr->nbv) ? fr->nbv : NULL);
               wcycle_set_reset_counters(wcycle, -1);
               if (!(cr->duty & DUTY_PME))
               {
@@@ -1980,7 -2033,7 +2005,7 @@@
       if (pme_loadbal != NULL)
       {
           pme_loadbal_done(pme_loadbal, cr, fplog,
- -                         fr->nbv != NULL && fr->nbv->bUseGPU);
+ +                         use_GPU(fr->nbv));
       }
   
       if (shellfc && fplog)
diff --combined src/programs/mdrun/pme_loadbal.c

index 5ba76e63e0edab5bba4a85a415fce497367b0386,d6fd68f37269d10692d625172706519c053c2cf3..dd2cd1a2e9f049b01ff6de90078d249f925c0cf7
--- 1/src/programs/mdrun/pme_loadbal.c
--- 2/src/programs/mdrun/pme_loadbal.c
+++ b/src/programs/mdrun/pme_loadbal.c
@@@ -32,27 -32,23 +32,27 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
   
- -#include "gromacs/utility/smalloc.h"
- -#include "types/commrec.h"
- -#include "network.h"
- -#include "calcgrid.h"
- -#include "pme.h"
- -#include "vec.h"
- -#include "domdec.h"
- -#include "nbnxn_cuda_data_mgmt.h"
- -#include "force.h"
- -#include "macros.h"
- -#include "md_logging.h"
   #include "pme_loadbal.h"
   
+ +#include "config.h"
+ +
+ +#include "gromacs/legacyheaders/calcgrid.h"
+ +#include "gromacs/legacyheaders/domdec.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/md_logging.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/pme.h"
+ +#include "gromacs/legacyheaders/sim_util.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
   /* Parameters and setting for one PP-PME setup */
   typedef struct {
       real      rcut_coulomb;    /* Coulomb cut-off                              */
@@@ -265,6 -261,15 +265,15 @@@ static gmx_bool pme_loadbal_increase_cu
       while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
   
       set->rcut_coulomb = pme_lb->cut_spacing*sp;
+     if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+     {
+         /* This is unlikely, but can happen when e.g. continuing from
+          * a checkpoint after equilibration where the box shrank a lot.
+          * We want to avoid rcoulomb getting smaller than rvdw
+          * and there might be more issues with decreasing rcoulomb.
+          */
+         set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+     }
   
       if (pme_lb->cutoff_scheme == ecutsVERLET)
       {
@@@ -431,17 -436,17 +440,17 @@@ static void switch_to_stage1(pme_load_b
       pme_lb->cur = pme_lb->start - 1;
   }
   
- -gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
- -                          t_commrec           *cr,
- -                          FILE                *fp_err,
- -                          FILE                *fp_log,
- -                          t_inputrec          *ir,
- -                          t_state             *state,
- -                          double               cycles,
- -                          interaction_const_t *ic,
- -                          nonbonded_verlet_t  *nbv,
- -                          gmx_pme_t           *pmedata,
- -                          gmx_int64_t          step)
+ +gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
+ +                          t_commrec                  *cr,
+ +                          FILE                       *fp_err,
+ +                          FILE                       *fp_log,
+ +                          t_inputrec                 *ir,
+ +                          t_state                    *state,
+ +                          double                      cycles,
+ +                          interaction_const_t        *ic,
+ +                          struct nonbonded_verlet_t  *nbv,
+ +                          gmx_pme_t                  *pmedata,
+ +                          gmx_int64_t                 step)
   {
       gmx_bool     OK;
       pme_setup_t *set;
@@@ -693,26 -698,30 +702,26 @@@
       }
   
       bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
- -    if (pme_lb->cutoff_scheme == ecutsVERLET &&
- -        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
- -    {
- -        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
- -
- -        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
- -         * also sharing texture references. To keep the code simple, we don't
- -         * treat texture references as shared resources, but this means that
- -         * the coulomb_tab texture ref will get updated by multiple threads.
- -         * Hence, to ensure that the non-bonded kernels don't start before all
- -         * texture binding operations are finished, we need to wait for all ranks
- -         * to arrive here before continuing.
- -         *
- -         * Note that we could omit this barrier if GPUs are not shared (or
- -         * texture objects are used), but as this is initialization code, there
- -         * is not point in complicating things.
- -         */
+ +    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+ +
+ +    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ +     * also sharing texture references. To keep the code simple, we don't
+ +     * treat texture references as shared resources, but this means that
+ +     * the coulomb_tab texture ref will get updated by multiple threads.
+ +     * Hence, to ensure that the non-bonded kernels don't start before all
+ +     * texture binding operations are finished, we need to wait for all ranks
+ +     * to arrive here before continuing.
+ +     *
+ +     * Note that we could omit this barrier if GPUs are not shared (or
+ +     * texture objects are used), but as this is initialization code, there
+ +     * is not point in complicating things.
+ +     */
   #ifdef GMX_THREAD_MPI
- -        if (PAR(cr))
- -        {
- -            gmx_barrier(cr);
- -        }
- -#endif  /* GMX_THREAD_MPI */
+ +    if (PAR(cr) && use_GPU(nbv))
+ +    {
+ +        gmx_barrier(cr);
       }
+ +#endif  /* GMX_THREAD_MPI */
   
       /* Usually we won't need the simple tables with GPUs.
        * But we do with hybrid acceleration and with free energy.
author	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 1 Oct 2014 20:02:02 +0000 (22:02 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 1 Oct 2014 21:18:54 +0000 (23:18 +0200)
		1	2
CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxCFlags.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxana/gmx_tune_pme.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/calcgrid.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxpreprocess/toppush.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/domdec.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/listed-forces/bonded.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/domdec.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/force.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/mdatom.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/minimize.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_search.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/ns.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/sim_util.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/update.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/timing/wallcycle.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/timing/wallcycle.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/programs/mdrun/md.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/programs/mdrun/pme_loadbal.c	patch \|	diff1 \|	diff2 \|	blob \| history