Merge branch 'origin/release-2020' into master
author Paul Bauer <paul.bauer.q@gmail.com>
Wed, 6 May 2020 12:31:38 +0000 (14:31 +0200)
committer Paul Bauer <paul.bauer.q@gmail.com>
Wed, 6 May 2020 12:31:38 +0000 (14:31 +0200)
Resolved Conflicts:
admin/clang-tidy.sh
admin/containers/buildall.sh
admin/containers/utility.py
admin/dockerfiles/buildall.sh
admin/dockerfiles/ci-clang/Dockerfile
admin/dockerfiles/ci-docs-clang/Dockerfile
admin/dockerfiles/ci-docs-gcc/Dockerfile
admin/dockerfiles/ci-gcc/Dockerfile
admin/gitlab-ci/archive.gitlab-ci.yml
admin/gitlab-ci/documentation.gitlab-ci.yml
admin/gitlab-ci/global.gitlab-ci.yml
admin/gitlab-ci/gromacs.gitlab-ci.yml
admin/gitlab-ci/lint.gitlab-ci.yml
cmake/gmxManageSimd.cmake
cmake/gmxVersionInfo.cmake
docs/dev-manual/containers.rst
src/CMakeLists.txt
src/api/cpp/tests/testingconfiguration.h
src/buildinfo.h.cmakein
src/gromacs/CMakeLists.txt
src/gromacs/ewald/pme_only.cpp
src/gromacs/fileio/matio.cpp
src/gromacs/fileio/oenv.cpp
src/gromacs/gmxana/gmx_do_dssp.cpp
src/gromacs/gmxana/nrama.cpp
src/gromacs/gmxlib/network.cpp
src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp
src/gromacs/listed_forces/bonded.cpp
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdtypes/interaction_const.h
src/gromacs/nbnxm/pairlist.h
src/gromacs/selection/CMakeLists.txt
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd_double.h
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd_float.h
src/gromacs/simd/simd.h
src/gromacs/utility/basedefinitions.h
src/gromacs/utility/binaryinformation.cpp
src/testutils/CMakeLists.txt
src/testutils/TestMacros.cmake
src/testutils/testasserts.h

Change-Id: Ie157786299ca8f2d8c77a148b3bf1b1061f44b8a

36 files changed:
admin/containers/buildall.sh
admin/containers/utility.py
cmake/gmxManageSimd.cmake
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/release-notes/2020/2020.2.rst
docs/release-notes/index.rst
src/CMakeLists.txt
src/api/cpp/CMakeLists.txt
src/api/cpp/tests/testingconfiguration.h
src/buildinfo.h.cmakein
src/gromacs/CMakeLists.txt
src/gromacs/domdec/domdec.cpp
src/gromacs/fileio/matio.cpp
src/gromacs/gmxana/gmx_do_dssp.cpp
src/gromacs/gmxana/nrama.cpp
src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp
src/gromacs/listed_forces/bonded.cpp
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/isimulator.h
src/gromacs/mdrun/md.cpp
src/gromacs/mdtypes/interaction_const.h
src/gromacs/nbnxm/grid.h
src/gromacs/nbnxm/nbnxm_setup.cpp
src/gromacs/nbnxm/pairlist.h
src/gromacs/options/optionstoragetemplate.h
src/gromacs/selection/CMakeLists.txt
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd_double.h
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd_float.h
src/gromacs/simd/simd.h
src/gromacs/topology/block.h
src/gromacs/utility/basedefinitions.h
src/gromacs/utility/binaryinformation.cpp
src/testutils/CMakeLists.txt
src/testutils/TestMacros.cmake

index ee38d782a71950ca42985e2ad074cd0dd0d38209,1492f0f775e8df87509e397ad5d0722fc4f52c42..afc2b7166234401ea24badb5becb570f581124eb
@@@ -17,6 -17,10 +17,10 @@@ tag="gromacs/cmake-3.9.6-gcc-6-cuda-10.
  tags[${#tags[@]}]=$tag
  python3 $SCRIPT --cmake 3.9.6 --gcc 6 --cuda 10.1 --opencl --clfft --mpi openmpi | docker build -t $tag -
  
+ tag="gromacs/cmake-3.9.6-gcc-7-amdopencl-clfft-openmpi:2020"
+ tags[${#tags[@]}]=$tag
+ python3 $SCRIPT --cmake 3.9.6 --gcc 7 --opencl amd --clfft --mpi openmpi | docker build -t $tag -
  tag="gromacs/cmake-3.15.7-gcc-8-cuda-10.1-openmpi:2020"
  tags[${#tags[@]}]=$tag
  python3 $SCRIPT --cmake 3.15.7 --gcc 8 --cuda 10.1 --mpi openmpi | docker build -t $tag -
@@@ -41,52 -45,20 +45,53 @@@ tag="gromacs/cmake-3.15.7-llvm-8-intelo
  tags[${#tags[@]}]=$tag
  python3 $SCRIPT --cmake 3.15.7 --llvm 8 --opencl intel --mpi openmpi | docker build -t $tag -
  
 -tag="gromacs/cmake-3.9.6-llvm-3.6-amdopencl-openmpi:2020"
 +tag="gromacs/cmake-3.9.6-llvm-8-amdopencl-openmpi:2020"
  tags[${#tags[@]}]=$tag
 -python3 $SCRIPT --ubuntu 16.04 --cmake 3.9.6 --llvm 3.6 --opencl amd --mpi openmpi | docker build -t $tag -
 +python3 $SCRIPT --cmake 3.9.6 --llvm 8 --opencl amd --mpi openmpi | docker build -t $tag -
  
 -tag=gromacs/ci-docs-llvm:2020
 +tag="gromacs/cmake-3.15.7-gcc-8-cuda-10.1-nvidiaopencl-clfft-openmpi:master"
  tags[${#tags[@]}]=$tag
 -python3 $SCRIPT --llvm --doxygen | docker build -t $tag -
 +python3 $SCRIPT --cmake 3.15.7 --gcc 8 --cuda 10.1 --opencl --clfft --mpi openmpi \
 +| docker build -t $tag -
  
 -tag=gromacs/ci-docs-gcc:2020
 +tag="gromacs/cmake-3.13.0-gcc-7-amdopencl-clfft-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.13.0 --gcc 7 --opencl amd --clfft --mpi openmpi | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.13.0-llvm-8-tsan:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.13.0 --llvm 8 --tsan | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.15.7-llvm-8-cuda-10.0-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.15.7 --llvm 8 --cuda 10.0 --mpi openmpi | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.15.7-llvm-8-cuda-10.1-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.15.7 --llvm 8 --cuda 10.1 --mpi openmpi | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.15.7-llvm-9-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.15.7 --llvm 9 --mpi openmpi | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.13.0-llvm-9-intelopencl-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.13.0 --llvm 9 --opencl intel --mpi openmpi | docker build -t $tag -
 +
 +tag="gromacs/cmake-3.13.0-llvm-9-amdopencl-openmpi:master"
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.13.0 --llvm 9 --opencl amd --mpi openmpi | docker build -t $tag -
 +
 +tag=gromacs/ci-docs-llvm:master
 +tags[${#tags[@]}]=$tag
 +python3 $SCRIPT --cmake 3.17.2 --llvm --doxygen | docker build -t $tag -
 +
 +tag=gromacs/ci-docs-gcc:master
  tags[${#tags[@]}]=$tag
  python3 $SCRIPT --gcc --doxygen | docker build -t $tag -
  
  docker login
  for tag in "${tags[@]}"; do
-   docker push $tag
+   echo "Pushing $tag"
+   #docker push $tag
  done
index b556f85292c91d461502b808a24496954e9df45c,e413a19be0c7f1fb661ddf9cd54f33dfd8b62e7b..6652b8edfde5e0de142b255af6af00c9985cf926
@@@ -58,20 -58,20 +58,20 @@@ parsers for tools
  """
  
  # TODO: Try using distutils.version.StrictVersion.
 -parser.add_argument('--cmake', type=str, default='3.9.6',
 -                    choices=['3.9.6', '3.11.4', '3.15.7'],
 +parser.add_argument('--cmake', type=str, default='3.13.0',
 +                    choices=['3.9.6', '3.11.4', '3.13.0', '3.14.7', '3.15.7', '3.16.6', '3.17.2'],
                      help='Selection of CMake version to provide to base image')
  compiler_group = parser.add_mutually_exclusive_group()
  compiler_group.add_argument('--gcc', type=int, nargs='?', const=7, default=7,
                              choices=[5, 6, 7, 8, 9],
                              help='Select GNU compiler tool chain. (Default) '
                                   'Some checking is implemented to avoid incompatible combinations')
- compiler_group.add_argument('--llvm', type=int, nargs='?', const=7, default=None,
-                             choices=[3, 6, 7, 8, 9],
+ compiler_group.add_argument('--llvm', type=str, nargs='?', const='7', default=None,
+                             choices=['3.6', '6', '7', '8'],
                              help='Select LLVM compiler tool chain. '
                                   'Some checking is implemented to avoid incompatible combinations')
  compiler_group.add_argument('--icc', type=int, nargs='?', const=19, default=None,
 -                            choices=[19],
 +                            choices=[19, 20],
                              help='Select Intel compiler tool chain. '
                                   'Some checking is implemented to avoid incompatible combinations')
  
index 6f0c6343eb7e49549e3a69d6e3a3afac4e3d4c00,cd507c166be8f868a9c87a413c5330678b0a8421..55c49b921f19c1a21ea8b327d7cb55c9c9edf169
@@@ -1,8 -1,8 +1,8 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2012,2013,2014,2015, The GROMACS development team.
 -# Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 +# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -264,6 -264,13 +264,13 @@@ elseif(GMX_SIMD_ACTIVE STREQUAL "IBM_VM
  
  elseif(GMX_SIMD_ACTIVE STREQUAL "IBM_VSX")
  
+     # IBM_VSX and gcc > 9 do not work together, so we need to prevent people from
+     # choosing a combination that might fail. Issue #3380.
+     if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "9")
+         message(FATAL_ERROR "IBM_VSX does not work together with gcc > 9. Disable SIMD support (slower), or use an older version of the GNU compiler")
+     endif()
      gmx_find_simd_ibm_vsx_flags(SIMD_IBM_VSX_C_SUPPORTED SIMD_IBM_VSX_CXX_SUPPORTED
                                  SIMD_IBM_VSX_C_FLAGS SIMD_IBM_VSX_CXX_FLAGS)
  
      # is not required for SIMD support on this platform. cmake through
      # at least version 3.7 cannot pass this check with the C compiler
      # in the latest xlc 13.1.5, but the C++ compiler has different
 -    # behaviour and is OK. See Redmine #2102.
 +    # behaviour and is OK. See Issue #2102.
      if(NOT SIMD_IBM_VSX_CXX_SUPPORTED)
          gmx_give_fatal_error_when_simd_support_not_found("IBM VSX" "disable SIMD support (slower)" "${SUGGEST_BINUTILS_UPDATE}")
      endif()
index 136b8893c8fbfb82df19c3762ac59fdb9a62a5e5,7ca2bff88208c52b04209629c0cdeae2d50fc81a..58f3ff6efc8389ca82ee090cc010f03057a7e155
@@@ -1,8 -1,7 +1,8 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 +# Copyright (c) 2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -59,7 -58,6 +59,7 @@@
  #         GROMACS     2018   3
  #         GROMACS     2019   4
  #         GROMACS     2020   5
 +#         GROMACS     2021   6
  #   LIBRARY_SOVERSION_MINOR so minor version for the built libraries.
  #       Should be increased for each release that changes only the implementation.
  #       In GROMACS, the typical policy is to increase it for each patch version
  
  # The GROMACS convention is that these are the version number of the next
  # release that is going to be made from this branch.
 -set(GMX_VERSION_MAJOR 2020)
 -set(GMX_VERSION_PATCH 3)
 +set(GMX_VERSION_MAJOR 2021)
 +set(GMX_VERSION_PATCH 0)
  # The suffix, on the other hand, is used mainly for betas and release
  # candidates, where it signifies the most recent such release from
  # this branch; it will be empty before the first such release, as well
@@@ -214,7 -212,7 +214,7 @@@ set(GMX_VERSION_SUFFIX ""
  # here. The important thing is to minimize the chance of third-party
  # code being able to dynamically link with a version of libgromacs
  # that might not work.
 -set(LIBRARY_SOVERSION_MAJOR 5)
 +set(LIBRARY_SOVERSION_MAJOR 6)
  set(LIBRARY_SOVERSION_MINOR 0)
  set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
  
@@@ -256,13 -254,13 +256,13 @@@ if (NOT SOURCE_IS_SOURCE_DISTRIBUTION A
  endif()
  
  set(REGRESSIONTEST_VERSION "${GMX_VERSION_STRING}")
 -set(REGRESSIONTEST_BRANCH "release-2020")
 +set(REGRESSIONTEST_BRANCH "master")
  # Run the regressiontests packaging job with the correct pakage
  # version string, and the release box checked, in order to have it
  # build the regressiontests tarball with all the right naming. The
  # naming affects the md5sum that has to go here, and if it isn't right
  # release workflow will report a failure.
- set(REGRESSIONTEST_MD5SUM "b032e4517195b1f8cd9db87cee1b30df" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
+ set(REGRESSIONTEST_MD5SUM "b75c3b1bac0e4114e236f049fc7f1f1c" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
  
  math(EXPR GMX_VERSION_NUMERIC
       "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
@@@ -333,10 -331,13 +333,10 @@@ set(VERSION_INFO_DEPS         ${VERSION
  set(VERSION_INFO_CMAKEIN_FILE     ${CMAKE_CURRENT_LIST_DIR}/VersionInfo.cmake.cmakein)
  set(VERSION_INFO_CONFIGURE_SCRIPT ${CMAKE_CURRENT_LIST_DIR}/gmxConfigureVersionInfo.cmake)
  # A set of directories to scan for calculating the hash of source files.
- set(SET_OF_DIRECTORIES_TO_CHECKSUM  "${PROJECT_SOURCE_DIR}/src")
- list(APPEND SET_OF_DIRECTORIES_TO_CHECKSUM "${PROJECT_SOURCE_DIR}/python_packaging")
+ set(SET_OF_DIRECTORIES_TO_CHECKSUM  "src")
+ list(APPEND SET_OF_DIRECTORIES_TO_CHECKSUM "python_packaging")
  # Due to the limitations for passing a list as arguments, we make the directories a string here
  string(REPLACE ";" ":" DIRECTORIES_TO_CHECKSUM_STRING "${SET_OF_DIRECTORIES_TO_CHECKSUM}")
 -# Try to find python for the checksumming script
 -set(PythonInterp_FIND_QUIETLY ON)
 -find_package(PythonInterp 3.5)
  
  # Rules to create the VersionInfo.cmake file.
  # For git info, the sequence is:
@@@ -434,12 -435,17 +434,17 @@@ set(CHECKSUM_FILE "${PROJECT_SOURCE_DIR
  # not been tampered with.
  # Note: The RUN_ALWAYS here is to regenerate the hash file only, it does not
  # mean that the target is run in all builds
- if (PYTHON_EXECUTABLE)
+ if (PYTHONINTERP_FOUND)
+     # We need the full path to the directories after passing it through
+     set(FULL_PATH_DIRECTORIES "")
+     foreach(DIR ${SET_OF_DIRECTORIES_TO_CHECKSUM})
+         list(APPEND FULL_PATH_DIRECTORIES "${PROJECT_SOURCE_DIR}/${DIR}")
+     endforeach()
      gmx_add_custom_output_target(reference_checksum RUN_ALWAYS
          OUTPUT ${CHECKSUM_FILE}
          COMMAND ${PYTHON_EXECUTABLE}
              ${PROJECT_SOURCE_DIR}/admin/createFileHash.py
-             -s ${SET_OF_DIRECTORIES_TO_CHECKSUM}
+             -s ${FULL_PATH_DIRECTORIES}
              -o ${CHECKSUM_FILE}
          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
          COMMENT "Generating reference checksum of source files")
diff --combined docs/CMakeLists.txt
index 3d8ce9b2e0dea083010a4501d2473abaced3c19d,ca252f6648f3e40d3c02833331b21b822501fa6a..f5503ad2b0f3e50fc5e427d19ebdce3a81e8c779
@@@ -1,8 -1,7 +1,8 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 +# Copyright (c) 2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -48,8 -47,9 +48,8 @@@
  # of configure time, because 1) some of the version variables are only
  # available during build time, and 2) we don't want to do all the Sphinx setup
  # during configuration to save some time when not building the content.
 -# All the generated values get put into conf-vars.py (generated from
 -# conf-vars.py.cmakein), which in turn is included by the Sphinx configuration
 -# file conf.py.
 +# All of the generated values get put into conf.py (generated from
 +# conf.cmakein.py).
  
  set(SOURCE_MD5SUM "unknown" CACHE STRING
      "MD5 sum of the source tarball, normally used only for the pre-release webpage build")
@@@ -327,21 -327,19 +327,21 @@@ if (SPHINX_FOUND
      set(SPHINX_SOURCE_FILES
          index.rst
          download.rst
 -        conf.py
          links.dat
          dev-manual/build-system.rst
          dev-manual/change-management.rst
          dev-manual/commitstyle.rst
 +        dev-manual/containers.rst
          dev-manual/documentation-generation.rst
          dev-manual/contribute.rst
          dev-manual/doxygen.rst
          dev-manual/error-handling.rst
          dev-manual/formatting.rst
 +        dev-manual/gitlab.rst
          dev-manual/gmxtree.rst
          dev-manual/includestyle.rst
          dev-manual/index.rst
 +        dev-manual/infrastructure.rst
          dev-manual/jenkins.rst
          dev-manual/known-issues.rst
          dev-manual/language-features.rst
          how-to/visualize.rst
          install-guide/index.rst
          release-notes/index.rst
 +        release-notes/2021/major/highlights.rst
 +        release-notes/2021/major/features.rst
 +        release-notes/2021/major/performance.rst
 +        release-notes/2021/major/tools.rst
 +        release-notes/2021/major/bugs-fixed.rst
 +        release-notes/2021/major/removed-functionality.rst
 +        release-notes/2021/major/deprecated-functionality.rst
 +        release-notes/2021/major/portability.rst
 +        release-notes/2021/major/miscellaneous.rst
          release-notes/2020/2020.1.rst
          release-notes/2020/2020.2.rst
+         release-notes/2020/2020.3.rst
          release-notes/2020/major/highlights.rst
          release-notes/2020/major/features.rst
          release-notes/2020/major/performance.rst
          set(IMAGE_CONVERT_STRING "impossible")
      endif()
  
 -    set(SPHINX_CONFIG_VARS_FILE ${SPHINX_INPUT_DIR}/conf-vars.py)
 +    set(SPHINX_CONFIG_FILE ${SPHINX_INPUT_DIR}/conf.py)
      if (GMX_PYTHON_PACKAGE)
          set(GMXAPI_PYTHON_STAGING_DIR ${CMAKE_BINARY_DIR}/python_packaging/src/gmxapi_staging)
          # TODO: Resolve circular reference. We would like to get the CMake build-time directory for
          # in this context?
      endif ()
  
 -    gmx_configure_version_file(conf-vars.py.cmakein ${SPHINX_CONFIG_VARS_FILE}
 +    gmx_configure_version_file(
 +        conf.cmakein.py ${SPHINX_CONFIG_FILE}
          EXTRA_VARS
 -            SPHINX_EXTENSION_PATH RELENG_PATH
 -            IMAGE_CONVERT_STRING
 +            CMAKE_MINIMUM_REQUIRED_VERSION
              EXPECTED_DOXYGEN_VERSION
              EXPECTED_SPHINX_VERSION
 -            CMAKE_MINIMUM_REQUIRED_VERSION REQUIRED_CUDA_VERSION
 -            REQUIRED_OPENCL_MIN_VERSION
 -            REQUIRED_CUDA_COMPUTE_CAPABILITY REGRESSIONTEST_VERSION
 -            SOURCE_MD5SUM REGRESSIONTEST_MD5SUM_STRING
 -            GMX_TNG_MINIMUM_REQUIRED_VERSION
 +            GMX_ADMIN_DIR
              GMX_LMFIT_REQUIRED_VERSION
              GMX_MANUAL_DOI_STRING
 +            GMX_TNG_MINIMUM_REQUIRED_VERSION
              GMX_SOURCE_DOI_STRING
              GMXAPI_PYTHON_STAGING_DIR
 +            IMAGE_CONVERT_STRING
 +            REGRESSIONTEST_VERSION
 +            REQUIRED_CUDA_COMPUTE_CAPABILITY
 +            REQUIRED_CUDA_VERSION
 +            REQUIRED_OPENCL_MIN_VERSION
 +            REGRESSIONTEST_MD5SUM_STRING
 +            RELENG_PATH
 +            SOURCE_MD5SUM
 +            SPHINX_EXTENSION_PATH
          COMMENT "Configuring Sphinx configuration file")
 -    gmx_add_sphinx_input_file(${SPHINX_CONFIG_VARS_FILE})
 +    gmx_add_sphinx_input_file(${SPHINX_CONFIG_FILE})
      gmx_add_sphinx_source_files(FILES ${SPHINX_SOURCE_FILES})
      if (EXISTS ${RELENG_PATH}/docs/FileList.cmake)
          include(${RELENG_PATH}/docs/FileList.cmake)
  
      # Sphinx cache with pickled ReST documents
      set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
 +    set(SPHINX_CONFIG_OVERRIDES "")
 +    if (GMX_DEVELOPER_BUILD)
 +        set(SPHINX_CONFIG_OVERRIDES "-Dtodo_include_todos=1")
 +    endif()
      add_custom_target(webpage-sphinx
          DEPENDS sphinx-programs
          DEPENDS sphinx-input
              -q -b html
              -w sphinx-html.log
              -d "${SPHINX_CACHE_DIR}"
 +            ${SPHINX_CONFIG_OVERRIDES}
              "${SPHINX_INPUT_DIR}"
              "${HTML_OUTPUT_DIR}"
          WORKING_DIRECTORY
@@@ -704,7 -683,7 +705,7 @@@ set(HTML_BUILD_NOT_POSSIBLE_REASON
  set(HTML_BUILD_WARNINGS)
  
  # Next, turn it off if any of the preconditions are unsatisified
 -if (NOT PythonInterp_FOUND)
 +if (NOT Python3_Interpreter_FOUND)
      set(HTML_BUILD_IS_POSSIBLE OFF)
      set(HTML_BUILD_NOT_POSSIBLE_REASON "Python is required")
  elseif (NOT SPHINX_FOUND)
index d10999d4e9838a856eec1ce0e53ccd3422bc882d,7120dbaa462804dfaa3c09ebd8f976cc04e0ee02..0ef68802565ab2d303fca4810ccad4a8fe553b1e
@@@ -1,7 -1,7 +1,7 @@@
  GROMACS 2020.2 release notes
  ----------------------------
  
- This version was released on TODO, 2020. These release notes
+ This version was released on April 30th, 2020. These release notes
  document the changes that have taken place in GROMACS since the
  previous 2020.1 version, to fix known issues. It also incorporates all
  fixes made in version 2019.6 and earlier, which you can find described
@@@ -10,18 -10,91 +10,91 @@@ in the :ref:`release-notes`
  .. Note to developers!
     Please use """"""" to underline the individual entries for fixed issues in the subfolders,
     otherwise the formatting on the webpage is messed up.
 -   Also, please use the syntax :issue:`number` to reference issues on redmine, without the
 +   Also, please use the syntax :issue:`number` to reference issues on GitLab, without the
     a space between the colon and number!
  
  Fixes where mdrun could behave incorrectly
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  
+ Ewald dipole correction incorrect without domain decomposition
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+ Ewald dipole correction (epsilon-surface != 0) is now disabled when not
+ using domain decomposition. With domain decomposition, it only works
+ when each molecule consists of a single update group (e.g. water).
+ This will be fixed in release-2021.
+ :issue:`3441`
+ Expanded ensemble simulations restarted from checkpoints
+ """"""""""""""""""""""""""""""""""""""""""""""""""""""""
+ When restarting expanded ensemble simulations from checkpoints, expanded
+ ensemble would silently refuse to run, and simulations would remain in
+ their original lambda state.
+ :issue:`3465`
+ Fixed free energy calculations with LJ PME
+ """"""""""""""""""""""""""""""""""""""""""
+ Fixed an issue that calculated wrong long-range corrections when using
+ free energy perturbation with ``vdwtype = pme``. This affected forces,
+ energies, lambda derivatives and foreign lambdas.
+ :issue:`3470`
+ The velocities of the center of mass are now removed correctly in case of -update gpu
+ """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+ When the center of mass motion is removed, the velocities are updated in the CPU memory.
+ In case of GPU update, they should be copied back to the GPU memory after they were updated
+ on the CPU. This affected most runs where the velocity of the center of mass has to be removed,
+ in particular these where this velocity is large in the beginning of the run.
+ :issue:`3508`
+ Fix checkpoint restart with non-zero initial step
+ """""""""""""""""""""""""""""""""""""""""""""""""
+ When restarting from the checkpoint, the init-step mdp parameter was ignored while
+ checking if the simulation is already finished. As a result, this check only worked
+ properly when init-step was 0 or was not specified.
+ :issue:`3489`
  Fixes for ``gmx`` tools
  ^^^^^^^^^^^^^^^^^^^^^^^
  
+ Time output unit fixes
+ ^^^^^^^^^^^^^^^^^^^^^^
+ When selecting a time unit of microseconds or larger,
+ ``gmx tool -tu`` now produces the correct string in .xvg and
+ particularly .xvgr plots
+ Fix do_dssp
+ ^^^^^^^^^^^
+ The tool would fail with a segmentation fault.
+ :issue:`3444`
  Fixes that affect portability
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  
+ Give clearer message about not detecting IBM_VSX support in gcc > 9
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ CMake would fail with a confusing error message.
+ :issue:`3380`
  Miscellaneous
  ^^^^^^^^^^^^^
  
+ Fixed initial DLB state reporting
+ """""""""""""""""""""""""""""""""
+ The initial DLB state was reported incorrectly in the log file when
+ either the "on" or "auto" value was chosen at mdrun startup.
index e1796a0c9642fe50213b81add38d13c6a915747c,216de25a6abff9cc19ef880fb3538befbdcb4c84..1b30704e5787b298f21fdd9b8de48bdae708ee3d
@@@ -8,39 -8,17 +8,39 @@@ releases of |Gromacs|. Major releases c
  functionality supported, whereas patch releases contain only fixes for
  issues identified in the corresponding major releases.
  
 -Two versions of |Gromacs| are under active maintenance, the 2020
 -series and the 2019 series. In the latter, only highly conservative
 +Two versions of |Gromacs| are under active maintenance, the 2021
 +series and the 2020 series. In the latter, only highly conservative
  fixes will be made, and only to address issues that affect scientific
  correctness. Naturally, some of those releases will be made after the
 -year 2019 ends, but we keep 2019 in the name so users understand how
 +year 2020 ends, but we keep 2020 in the name so users understand how
  up to date their version is. Such fixes will also be incorporated into
 -the 2020 release series, as appropriate. Around the time the 2021
 -release is made, the 2019 series will no longer be maintained.
 +the 2021 release series, as appropriate. Around the time the 2022
 +release is made, the 2020 series will no longer be maintained.
  
  Where issue numbers are reported in these release notes, more details
 -can be found at https://redmine.gromacs.org at that issue number.
 +can be found at https://gitlab.com/gromacs/gromacs/-/issues at that issue number.
 +
 +|Gromacs| 2021 series
 +---------------------
 +
 +.. todolist::
 +
 +Major release
 +^^^^^^^^^^^^^
 +
 +.. toctree::
 +   :maxdepth: 1
 +
 +   2021/major/highlights
 +   2021/major/features
 +   2021/major/performance
 +   2021/major/tools
 +   2021/major/bugs-fixed
 +   2021/major/deprecated-functionality
 +   2021/major/removed-functionality
 +   2021/major/portability
 +   2021/major/miscellaneous
 +
  
  |Gromacs| 2020 series
  ---------------------
@@@ -53,6 -31,7 +53,7 @@@ Patch release
  
     2020/2020.1
     2020/2020.2
+    2020/2020.3
  
  Major release
  ^^^^^^^^^^^^^
diff --combined src/CMakeLists.txt
index 820e83c108255f1aa002ddb45bca9b0940c2d069,51c7c037667ee3c82681a5e988514bb2e908b3b1..4d75aabea445d73d8c4af593adbd60761d3ffe73
@@@ -1,9 -1,7 +1,9 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2009,2010,2011,2012,2013 by the GROMACS development team.
 +# Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 +# Copyright (c) 2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -48,10 -46,6 +48,6 @@@ if(GMX_USE_CUDA
      get_cuda_compiler_info(CUDA_COMPILER_INFO CUDA_DEVICE_COMPILER_FLAGS CUDA_HOST_COMPILER_FLAGS)
  endif()
  
- string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
- configure_file(config.h.cmakein config.h)
- configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
- configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)
  # Make a file with compiler flags used for libgromacs for each
  # langauge and build configuration.  The one that corresponds to
  # CMAKE_BUILD_TYPE is #included into buildinfo.h and populates the
@@@ -71,11 -65,18 +67,11 @@@ set(IGNORED_CLANG_ALL_WARNING
      "-Wno-covered-switch-default" #GCC gives maybe-uninitialized without default label and checks for illegal enum values.
      "-Wno-switch-enum" # default statement for enum is OK
  
 -    # TODO uncomment the next few ignore lines when we upgrade to test with clang 8 in Jenkins
 -
 -    # The barriers we use for tMPI and Nbnxm are sufficient, but it's
 -    # not known whether they're excessive. We assume they not
 -    # excessive.
 -    # "-Wno-atomic-implicit-seq-cst"
 -
      # We need to use macros like
      # GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR. Those will look strange
      # if they don't have a semicolon after them, and might confuse
      # tools like IDEs also.
 -    "-Wno-extra-semi-stmt"
 +    "-Wno-extra-semi-stmt"
  
      #Following ones are undecided/TODO
      "-Wno-disabled-macro-expansion"
      "-Wno-double-promotion")
  string(REPLACE " " ";" IGNORED_CLANG_ALL_WARNINGS "${IGNORED_CLANG_ALL_WARNINGS}")
  
 +option(GMX_CLANG_TIDY "Use clang-tidy" OFF)
  if (GMX_CLANG_TIDY)
 +   if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
 +   elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RelWithAssert")
 +   elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo")
 +   elseif("${CMAKE_BUILD_TYPE}" STREQUAL "ASAN")
 +   else()
 +       message(FATAL_ERROR "Can only use clang-tidy with build type containing asserts: Debug, RelWithAssert, RelWithDebInfo, ASAN.")
 +   endif()
 +   set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 +   mark_as_advanced(CMAKE_EXPORT_COMPILE_COMMANDS)
     set(CLANG_TIDY "clang-tidy" CACHE STRING "Name of clang-tidy executable")
     find_program(CLANG_TIDY_EXE NAMES "${CLANG_TIDY}"
         DOC "Path to clang-tidy executable")
     if(NOT CLANG_TIDY_EXE)
         message(FATAL_ERROR "clang-tidy not found.")
     endif()
 +   mark_as_advanced(CLANG_TIDY)
 +   mark_as_advanced(CLANG_TIDY_EXE)
  endif()
 -#####
  
  add_subdirectory(external)
  
@@@ -155,3 -145,21 +151,21 @@@ if (GMXAPI
     endif()
     add_subdirectory(api)
  endif()
+ # Configure header files with configuration-specific values. This step
+ # should follow all introspection e.g. looking for headers and
+ # libraries. If not, cmake will need to change the contents of the
+ # file upon subsequent runs of cmake. This can mean that
+ #
+ #  cmake $src && make && make test
+ #
+ # requires building all the source files that depend on the changed
+ # header file in both of the make stages. That's slow, and is useless
+ # busy work for ccache, too.
+ string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
+ configure_file(config.h.cmakein config.h)
+ configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
+ set(CMAKE_BUILD_CONFIGURATION_C_FLAGS   ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
+ set(CMAKE_BUILD_CONFIGURATION_CXX_FLAGS ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
+ configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)
index 394e80b04507506dfbb22fe31cd41a28a5be4120,ffbf907db67aabab7b5d3b47e31136db0abfe642..748cdcce09d3b9bfefc142c251ed86f90931f2ba
@@@ -66,8 -66,6 +66,8 @@@ add_library(gmxapi SHARE
  gmx_target_compile_options(gmxapi)
  target_compile_definitions(gmxapi PRIVATE HAVE_CONFIG_H)
  target_include_directories(gmxapi SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
 +# Should be possible to change this when resolving #3290
 +target_include_directories(gmxapi SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/src/external)
  
  # Define public interface. Make sure targets linking against `gmxapi` in the build
  # system don't accidentally have the implementation headers (this directory))
@@@ -106,6 -104,7 +106,7 @@@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin"
  endif()
  
  set_target_properties(gmxapi PROPERTIES
+                       OUTPUT_NAME "gmxapi${GMX_LIBS_SUFFIX}"
                        SOVERSION ${GMXAPI_MAJOR}
                        VERSION ${GMXAPI_RELEASE}
                        )
index 4e050d75b3e3e871ad30509a62d301d65fe66c09,2d4cf147f013d884bf1f21b690682fb55e955db4..3d160d7e1419e267e16f44da833fd62a7331546b
  #ifndef GROMACS_TESTINGCONFIGURATION_H
  #define GROMACS_TESTINGCONFIGURATION_H
  
 -#include <gtest/gtest.h>
 +#include "config.h"
  
  #include <string>
  #include <vector>
  
+ #include "config.h"
 +#include <gtest/gtest.h>
 +
  #include "gromacs/gmxpreprocess/grompp.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/utility/textwriter.h"
  
 +#include "programs/mdrun/tests/moduletest.h"
  #include "testutils/cmdlinetest.h"
  #include "testutils/testfilemanager.h"
 -#include "programs/mdrun/tests/moduletest.h"
  
  namespace gmxapi
  {
diff --combined src/buildinfo.h.cmakein
index c2c48ece3aee1a4e6516f25362d57afd53767aee,8a7a641e9b6b24ff484dca8ca4c7864bc088c474..ba5f3658b67f8352f588e1a463d504499e4754a6
@@@ -1,8 -1,7 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2012,2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
  /** C compiler used to build */
  #define BUILD_C_COMPILER        "@BUILD_C_COMPILER@"
  
+ /** C compiler flags for this build configuration */
+ #define CMAKE_BUILD_CONFIGURATION_C_FLAGS "@CMAKE_BUILD_CONFIGURATION_C_FLAGS@"
  #include "compilerflaginfo-@CMAKE_BUILD_TYPE@-C.h"
  
- /** C++ compiler flags used to build, or empty string if no C++ */
+ /** C++ compiler used to build */
  #define BUILD_CXX_COMPILER      "@BUILD_CXX_COMPILER@"
  
+ /** C++ compiler flags for this build configuration */
+ #define CMAKE_BUILD_CONFIGURATION_CXX_FLAGS "@CMAKE_BUILD_CONFIGURATION_CXX_FLAGS@"
  #include "compilerflaginfo-@CMAKE_BUILD_TYPE@-CXX.h"
  
  /** Installation prefix (default location of data files) */
index a549ddd9154598ed4e97b2ca421bfb4f82bcb37b,60e130c22af1f1edb189d4d5f361b07bf3c8dc38..0b369bde85feaaa8e3a180699bd07d8cedd18469
@@@ -1,8 -1,8 +1,8 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2010,2011,2012,2013,2014,2015, The GROMACS development team.
 -# Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
 +# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -61,19 -61,11 +61,19 @@@ function (gmx_add_libgromacs_sources
  endfunction ()
  
  # TODO Reconsider this, as the CUDA driver API is probably a simpler
 -# approach, at least for the build system. See Redmine #2530
 +# approach, at least for the build system. See Issue #2530
  function (gmx_compile_cpp_as_cuda)
      _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
  endfunction ()
  
 +# Permit the configuration to disable compiling the many nbnxm kernels
 +# and others involved in force calculations. Currently only
 +# short-ranged and bonded kernels are disabled this way, but in future
 +# others may be appropriate. Thus the cmake option is not specific to
 +# nbnxm module.
 +option(GMX_USE_SIMD_KERNELS "Whether to compile NBNXM and other SIMD kernels" ON)
 +mark_as_advanced(GMX_USE_SIMD_KERNELS)
 +
  # Add these contents first because linking their tests can take a lot
  # of time, so we want lots of parallel work still available after
  # linking starts.
@@@ -274,11 -266,6 +274,6 @@@ endif(
  # TODO Perhaps generalize this for all headers from src/external
  target_include_directories(libgromacs SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/src/external)
  
- # Temporary fix to allow external access to restraintpotentail
- target_include_directories(libgromacs PUBLIC
-                            $<INSTALL_INTERFACE:include>
-                            )
  if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
      # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file.
      # On KNL this can cause illegal instruction because the compiler might use non KNL AVX instructions
@@@ -312,8 -299,10 +307,8 @@@ set_target_properties(libgromacs PROPER
  gmx_manage_lmfit()
  target_link_libraries(libgromacs PRIVATE lmfit)
  
 -# Fix everything found by the latest version of clang that we use in
 -# Jenkins testing. This should be updated when we update the latest
 -# tested version of clang.
 -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^7\.0")
 +# Make sure we fix "everything" found by more recent versions of clang.
 +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7")
     target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>)
  endif()
  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
@@@ -350,18 -339,23 +345,25 @@@ if (GMX_CLANG_TIDY
         "${CLANG_TIDY_EXE};-warnings-as-errors=*")
  endif()
  
+ # clang-3.6 warns about a number of issues that are not reported by more modern compilers
+ # and we know they are not real issues. So we only check that it can compile without error
+ # but ignore all warnings.
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^3\.6")
+     target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-w>)
+ endif()
  # Only install the library in mdrun-only mode if it is actually necessary
  # for the binary
  if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
      install(TARGETS libgromacs
 +            COMPONENT libraries
              EXPORT libgromacs
              LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
              RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
              ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 -            INCLUDES DESTINATION include
 -            COMPONENT libraries)
 +            INCLUDES DESTINATION include)
 +    target_compile_definitions(libgromacs PUBLIC $<INSTALL_INTERFACE:GMX_DOUBLE=${GMX_DOUBLE_VALUE}>)
 +    add_library(Gromacs::libgromacs ALIAS libgromacs)
  endif()
  
  if (NOT GMX_BUILD_MDRUN_ONLY)
index bbd5d8a213a89898aff833d094edbb213ede4174,542275d91681b6b421ecdbce51c13e29d8dd55ab..3f355f71a835a42ada515934ccf1a0594c98d79e
@@@ -64,7 -64,6 +64,7 @@@
  #include "gromacs/domdec/partition.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
 +#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/listed_forces/manage_threading.h"
@@@ -120,7 -119,7 +120,7 @@@ using gmx::DdRankOrder
  using gmx::DlbOption;
  using gmx::DomdecOptions;
  
- static const char* edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
+ static const char* edlbs_names[int(DlbState::nr)] = { "off", "off", "auto", "locked", "on", "on" };
  
  /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
  #define DD_CGIBS 2
@@@ -146,6 -145,18 +146,6 @@@ static const int ddNonbondedZonePairRan
                                                                 { 2, 5, 6 },
                                                                 { 3, 5, 7 } };
  
 -
 -/*
 -   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 -
 -   static void index2xyz(ivec nc,int ind,ivec xyz)
 -   {
 -   xyz[XX] = ind % nc[XX];
 -   xyz[YY] = (ind / nc[XX]) % nc[YY];
 -   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 -   }
 - */
 -
  static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
  {
      xyz[XX] = ind / (nc[YY] * nc[ZZ]);
@@@ -158,7 -169,7 +158,7 @@@ static int ddcoord2ddnodeid(gmx_domdec_
      int ddnodeid = -1;
  
      const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 -    const int                 ddindex   = dd_index(dd->nc, c);
 +    const int                 ddindex   = dd_index(dd->numCells, c);
      if (cartSetup.bCartesianPP_PME)
      {
          ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
@@@ -692,11 -703,24 +692,11 @@@ static int gmx_ddcoord2pmeindex(const t
      ivec          coords;
      int           slab;
  
 -    dd = cr->dd;
 -    /*
 -       if (dd->comm->bCartesian) {
 -       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 -       dd_coords2pmecoords(dd,coords,coords_pme);
 -       copy_ivec(dd->ntot,nc);
 -       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 -       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 -
 -       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 -       } else {
 -       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 -       }
 -     */
 +    dd         = cr->dd;
      coords[XX] = x;
      coords[YY] = y;
      coords[ZZ] = z;
 -    slab       = ddindex2pmeindex(dd->comm->ddRankSetup, dd_index(dd->nc, coords));
 +    slab       = ddindex2pmeindex(dd->comm->ddRankSetup, dd_index(dd->numCells, coords));
  
      return slab;
  }
@@@ -715,7 -739,7 +715,7 @@@ static int ddcoord2simnodeid(const t_co
      }
      else
      {
 -        int ddindex = dd_index(cr->dd->nc, coords);
 +        int ddindex = dd_index(cr->dd->numCells, coords);
          if (cartSetup.bCartesianPP)
          {
              nodeid = cartSetup.ddindex2simnodeid[ddindex];
@@@ -862,7 -886,7 +862,7 @@@ static gmx_bool receive_vir_ener(const 
              ivec coords;
              MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
              coords[cartSetup.cartpmedim]++;
 -            if (coords[cartSetup.cartpmedim] < dd->nc[cartSetup.cartpmedim])
 +            if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
              {
                  int rank;
                  MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
@@@ -900,9 -924,9 +900,9 @@@ static void set_slb_pme_dim_f(gmx_domde
  
      comm = dd->comm;
  
 -    snew(*dim_f, dd->nc[dim] + 1);
 +    snew(*dim_f, dd->numCells[dim] + 1);
      (*dim_f)[0] = 0;
 -    for (i = 1; i < dd->nc[dim]; i++)
 +    for (i = 1; i < dd->numCells[dim]; i++)
      {
          if (comm->slb_frac[dim])
          {
          }
          else
          {
 -            (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->nc[dim]);
 +            (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
          }
      }
 -    (*dim_f)[dd->nc[dim]] = 1;
 +    (*dim_f)[dd->numCells[dim]] = 1;
  }
  
  static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
      snew(ddpme->pp_max, ddpme->nslab);
      for (int slab = 0; slab < ddpme->nslab; slab++)
      {
 -        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
          ddpme->pp_max[slab] = 0;
      }
      for (int i = 0; i < dd->nnodes; i++)
      {
          ivec xyz;
 -        ddindex2xyz(dd->nc, i, xyz);
 +        ddindex2xyz(dd->numCells, i, xyz);
          /* For y only use our y/z slab.
           * This assumes that the PME x grid size matches the DD grid size.
           */
@@@ -1045,10 -1069,10 +1045,10 @@@ static void make_load_communicator(gmx_
  
      dim = dd->dim[dim_ind];
      copy_ivec(loc, loc_c);
 -    for (i = 0; i < dd->nc[dim]; i++)
 +    for (i = 0; i < dd->numCells[dim]; i++)
      {
          loc_c[dim] = i;
 -        rank       = dd_index(dd->nc, loc_c);
 +        rank       = dd_index(dd->numCells, loc_c);
          if (rank == dd->rank)
          {
              /* This process is part of the group */
  
                  RowMaster& rowMaster = *cellsizes.rowMaster;
                  rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
 -                rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
 -                rowMaster.isCellMin.resize(dd->nc[dim]);
 +                rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
 +                rowMaster.isCellMin.resize(dd->numCells[dim]);
                  if (dim_ind > 0)
                  {
 -                    rowMaster.bounds.resize(dd->nc[dim]);
 +                    rowMaster.bounds.resize(dd->numCells[dim]);
                  }
 -                rowMaster.buf_ncd.resize(dd->nc[dim]);
 +                rowMaster.buf_ncd.resize(dd->numCells[dim]);
              }
              else
              {
          }
          if (dd->ci[dim] == dd->master_ci[dim])
          {
 -            snew(dd->comm->load[dim_ind].load, dd->nc[dim] * DD_NLOAD_MAX);
 +            snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
          }
      }
  }
@@@ -1169,7 -1193,7 +1169,7 @@@ static void make_load_communicators(gmx
      if (dd->ndim > 1)
      {
          dim0 = dd->dim[0];
 -        for (i = 0; i < dd->nc[dim0]; i++)
 +        for (i = 0; i < dd->numCells[dim0]; i++)
          {
              loc[dim0] = i;
              make_load_communicator(dd, 1, loc);
      if (dd->ndim > 2)
      {
          dim0 = dd->dim[0];
 -        for (i = 0; i < dd->nc[dim0]; i++)
 +        for (i = 0; i < dd->numCells[dim0]; i++)
          {
              loc[dim0] = i;
              dim1      = dd->dim[1];
 -            for (j = 0; j < dd->nc[dim1]; j++)
 +            for (j = 0; j < dd->numCells[dim1]; j++)
              {
                  loc[dim1] = j;
                  make_load_communicator(dd, 2, loc);
@@@ -1209,10 -1233,10 +1209,10 @@@ static void setup_neighbor_relations(gm
      {
          dim = dd->dim[d];
          copy_ivec(dd->ci, tmp);
 -        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
          dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
          copy_ivec(dd->ci, tmp);
 -        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
          dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
          if (debug)
          {
              s[d] = dd->ci[d] - zones->shift[i][d];
              if (s[d] < 0)
              {
 -                s[d] += dd->nc[d];
 +                s[d] += dd->numCells[d];
              }
 -            else if (s[d] >= dd->nc[d])
 +            else if (s[d] >= dd->numCells[d])
              {
 -                s[d] -= dd->nc[d];
 +                s[d] -= dd->numCells[d];
              }
          }
      }
                                             std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
          for (dim = 0; dim < DIM; dim++)
          {
 -            if (dd->nc[dim] == 1)
 +            if (dd->numCells[dim] == 1)
              {
                  /* All shifts should be allowed */
                  iZone.shift0[dim] = -1;
@@@ -1321,8 -1345,8 +1321,8 @@@ static void make_pp_communicator(const 
      {
          /* Set up cartesian communication for the particle-particle part */
          GMX_LOG(mdlog.info)
 -                .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d", dd->nc[XX],
 -                                     dd->nc[YY], dd->nc[ZZ]);
 +                .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
 +                                     dd->numCells[XX], dd->numCells[YY], dd->numCells[ZZ]);
  
          ivec periods;
          for (int i = 0; i < DIM; i++)
              periods[i] = TRUE;
          }
          MPI_Comm comm_cart;
 -        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder), &comm_cart);
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder),
 +                        &comm_cart);
          /* We overwrite the old communicator with the new cartesian one */
          cr->mpi_comm_mygroup = comm_cart;
      }
           * and not the one after split, we need to make an index.
           */
          cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
 -        cartSetup.ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
          gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
          /* Get the rank of the DD master,
           * above we made sure that the master node is a PP node.
          std::vector<int> buf(dd->nnodes);
          if (thisRankHasDuty(cr, DUTY_PP))
          {
 -            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +            buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
          }
          /* Communicate the ddindex to simulation nodeid index */
          MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM,
          {
              if (cartSetup.ddindex2simnodeid[i] == 0)
              {
 -                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                ddindex2xyz(dd->numCells, i, dd->master_ci);
                  MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
              }
          }
      {
          /* No Cartesian communicators */
          /* We use the rank in dd->comm->all as DD index */
 -        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        ddindex2xyz(dd->numCells, dd->rank, dd->ci);
          /* The simulation master nodeid is 0, so the DD master rank is also 0 */
          dd->masterrank = 0;
          clear_ivec(dd->master_ci);
@@@ -1435,7 -1458,7 +1435,7 @@@ static void receive_ddindex2simnodeid(g
          std::vector<int> buf(dd->nnodes);
          if (thisRankHasDuty(cr, DUTY_PP))
          {
 -            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +            buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
          }
          /* Communicate the ddindex to simulation nodeid index */
          MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM,
@@@ -1683,7 -1706,7 +1683,7 @@@ static void setupGroupCommunication(con
      /* We can not use DDMASTER(dd), because dd->masterrank is set later */
      if (MASTER(cr))
      {
 -        dd->ma = std::make_unique<AtomDistribution>(dd->nc, numAtomsInSystem, numAtomsInSystem);
 +        dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
      }
  }
  
@@@ -1766,11 -1789,10 +1766,11 @@@ static int dd_getenv(const gmx::MDLogge
  
  static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec* ir, const gmx::MDLogger& mdlog)
  {
 -    if (ir->ePBC == epbcSCREW && (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    if (ir->pbcType == PbcType::Screw
 +        && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
      {
          gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction",
 -                  epbc_names[ir->ePBC]);
 +                  c_pbcTypeNames[ir->pbcType].c_str());
      }
  
      if (ir->nstlist == 0)
          gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
      }
  
 -    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    if (ir->comm_mode == ecmANGULAR && ir->pbcType != PbcType::No)
      {
          GMX_LOG(mdlog.warning)
                  .appendText(
@@@ -2005,10 -2027,10 +2005,10 @@@ static void setupUpdateGroups(const gmx
  }
  
  UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
 -    npbcdim(ePBC2npbcdim(ir.ePBC)),
 +    npbcdim(numPbcDimensions(ir.pbcType)),
      numBoundedDimensions(inputrec2nboundeddim(&ir)),
      ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
 -    haveScrewPBC(ir.ePBC == epbcSCREW)
 +    haveScrewPBC(ir.pbcType == PbcType::Screw)
  {
  }
  
@@@ -2060,7 -2082,7 +2060,7 @@@ static DDSystemInfo getSystemInfo(cons
      systemInfo.useUpdateGroups = false;
      if (ir.cutoff_scheme == ecutsVERLET)
      {
 -        real cutoffMargin = std::sqrt(max_cutoff2(ir.ePBC, box)) - ir.rlist;
 +        real cutoffMargin = std::sqrt(max_cutoff2(ir.pbcType, box)) - ir.rlist;
          setupUpdateGroups(mdlog, mtop, ir, cutoffMargin, &systemInfo);
      }
  
                  dd_bonded_cg_distance(mdlog, &mtop, &ir, as_rvec_array(xGlobal.data()), box,
                                        options.checkBondedInteractions, &r_2b, &r_mb);
              }
 -            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 -            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr->mpi_comm_mygroup);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr->mpi_comm_mygroup);
  
              /* We use an initial margin of 10% for the minimum cell size,
               * except when we are just below the non-bonded cut-off.
@@@ -2421,18 -2443,18 +2421,18 @@@ static void set_dd_limits(const gmx::MD
      }
  
      /* Set the DD setup given by ddGridSetup */
 -    copy_ivec(ddGridSetup.numDomains, dd->nc);
 +    copy_ivec(ddGridSetup.numDomains, dd->numCells);
      dd->ndim = ddGridSetup.numDDDimensions;
      copy_ivec(ddGridSetup.ddDimensions, dd->dim);
  
 -    dd->nnodes = dd->nc[XX] * dd->nc[YY] * dd->nc[ZZ];
 +    dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
  
      snew(comm->slb_frac, DIM);
      if (isDlbDisabled(comm))
      {
 -        comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
 -        comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
 -        comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
 +        comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
 +        comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
 +        comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
      }
  
      /* Set the multi-body cut-off and cellsize limit for DLB */
               * the minimum and the maximum,
               * since the extra communication cost is nearly zero.
               */
 -            real acs           = average_cellsize_min(ddbox, dd->nc);
 +            real acs           = average_cellsize_min(ddbox, dd->numCells);
              comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
              if (!isDlbDisabled(comm))
              {
      }
  }
  
 -void dd_init_bondeds(FILE*              fplog,
 -                     gmx_domdec_t*      dd,
 -                     const gmx_mtop_t*  mtop,
 -                     const gmx_vsite_t* vsite,
 -                     const t_inputrec*  ir,
 -                     gmx_bool           bBCheck,
 -                     cginfo_mb_t*       cginfo_mb)
 +void dd_init_bondeds(FILE*                      fplog,
 +                     gmx_domdec_t*              dd,
 +                     const gmx_mtop_t&          mtop,
 +                     const gmx_vsite_t*         vsite,
 +                     const t_inputrec*          ir,
 +                     gmx_bool                   bBCheck,
 +                     gmx::ArrayRef<cginfo_mb_t> cginfo_mb)
  {
      gmx_domdec_comm_t* comm;
  
 -    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 +    dd_make_reverse_top(fplog, dd, &mtop, vsite, ir, bBCheck);
  
      comm = dd->comm;
  
@@@ -2547,16 -2569,16 +2547,16 @@@ static void writeSettings(gmx::TextWrit
          log->writeString("The allowed shrink of domain decomposition cells is:");
          for (d = 0; d < DIM; d++)
          {
 -            if (dd->nc[d] > 1)
 +            if (dd->numCells[d] > 1)
              {
 -                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                if (d >= ddbox->npbcdim && dd->numCells[d] == 2)
                  {
                      shrink = 0;
                  }
                  else
                  {
                      shrink = comm->cellsize_min_dlb[d]
 -                             / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->nc[d]);
 +                             / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
                  }
                  log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
              }
          log->writeString("The initial domain decomposition cell size is:");
          for (d = 0; d < DIM; d++)
          {
 -            if (dd->nc[d] > 1)
 +            if (dd->numCells[d] > 1)
              {
                  log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
              }
@@@ -2707,7 -2729,7 +2707,7 @@@ static void set_cell_limits_dlb(const g
      else
      {
          /* There is no cell size limit */
 -        npulse = std::max(dd->nc[XX] - 1, std::max(dd->nc[YY] - 1, dd->nc[ZZ] - 1));
 +        npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
      }
  
      if (!bNoCutOff && npulse > 1)
              dim      = dd->dim[d];
              npulse_d = static_cast<int>(
                      1
 -                    + dd->nc[dim] * comm->systemInfo.cutoff
 +                    + dd->numCells[dim] * comm->systemInfo.cutoff
                                / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
              npulse_d_max = std::max(npulse_d_max, npulse_d);
          }
      }
  
      comm->maxpulse       = 1;
 -    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    comm->bVacDLBNoLimit = (ir->pbcType == PbcType::No);
      for (d = 0; d < dd->ndim; d++)
      {
 -        if (comm->ddSettings.request1DAnd1Pulse)
 -        {
 -            comm->cd[d].np_dlb = 1;
 -        }
 -        else
 -        {
 -            comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]] - 1);
 -            comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
 -        }
 -        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]] - 1)
 +        comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
 +        comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
          {
              comm->bVacDLBNoLimit = FALSE;
          }
@@@ -2778,15 -2807,14 +2778,15 @@@ bool dd_moleculesAreAlwaysWhole(const g
      return dd.comm->systemInfo.moleculesAreAlwaysWhole;
  }
  
 -gmx_bool dd_bonded_molpbc(const gmx_domdec_t* dd, int ePBC)
 +gmx_bool dd_bonded_molpbc(const gmx_domdec_t* dd, PbcType pbcType)
  {
      /* If each molecule is a single charge group
       * or we use domain decomposition for each periodic dimension,
       * we do not need to take pbc into account for the bonded interactions.
       */
 -    return (ePBC != epbcNONE && dd->comm->systemInfo.haveInterDomainBondeds
 -            && !(dd->nc[XX] > 1 && dd->nc[YY] > 1 && (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +    return (pbcType != PbcType::No && dd->comm->systemInfo.haveInterDomainBondeds
 +            && !(dd->numCells[XX] > 1 && dd->numCells[YY] > 1
 +                 && (dd->numCells[ZZ] > 1 || pbcType == PbcType::XY)));
  }
  
  /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
@@@ -2830,13 -2858,13 +2830,13 @@@ static void set_ddgrid_parameters(cons
      logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
  
      real vol_frac;
 -    if (ir->ePBC == epbcNONE)
 +    if (ir->pbcType == PbcType::No)
      {
          vol_frac = 1 - 1 / static_cast<double>(dd->nnodes);
      }
      else
      {
 -        vol_frac = (1 + comm_box_frac(dd->nc, comm->systemInfo.cutoff, *ddbox))
 +        vol_frac = (1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
                     / static_cast<double>(dd->nnodes);
      }
      if (debug)
@@@ -2858,7 -2886,7 +2858,7 @@@ static DDSettings getDDSettings(const g
  
      ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
      ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
 -    ddSettings.request1DAnd1Pulse  = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
 +    ddSettings.request1D           = bool(dd_getenv(mdlog, "GMX_DD_1D", 0));
      ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
      ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
      ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
  
  gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
  
 -/*! \brief Return whether the simulation described can run a 1D single-pulse DD.
 +/*! \brief Return whether the simulation described can run a 1D DD.
   *
 - * The GPU halo exchange code requires a 1D single-pulse DD. Such a DD
 + * The GPU halo exchange code requires a 1D DD. Such a DD
   * generally requires a larger box than other possible decompositions
   * with the same rank count, so the calling code might need to decide
   * what is the most appropriate way to run the simulation based on
   * exchange code path. The number of PME ranks, if any, should be set
   * in \c options.numPmeRanks.
   */
 -static bool canMake1DAnd1PulseDomainDecomposition(const DDSettings&              ddSettingsOriginal,
 -                                                  const t_commrec*               cr,
 -                                                  const int                      numRanksRequested,
 -                                                  const DomdecOptions&           options,
 -                                                  const gmx_mtop_t&              mtop,
 -                                                  const t_inputrec&              ir,
 -                                                  const matrix                   box,
 -                                                  gmx::ArrayRef<const gmx::RVec> xGlobal)
 +static bool canMake1DDomainDecomposition(const DDSettings&              ddSettingsOriginal,
 +                                         const t_commrec*               cr,
 +                                         const int                      numRanksRequested,
 +                                         const DomdecOptions&           options,
 +                                         const gmx_mtop_t&              mtop,
 +                                         const t_inputrec&              ir,
 +                                         const matrix                   box,
 +                                         gmx::ArrayRef<const gmx::RVec> xGlobal)
  {
      // Ensure we don't write any output from this checking routine
      gmx::MDLogger dummyLogger;
  
      DDSystemInfo systemInfo = getSystemInfo(dummyLogger, cr, options, mtop, ir, box, xGlobal);
  
 -    DDSettings ddSettings             = ddSettingsOriginal;
 -    ddSettings.request1DAnd1Pulse     = true;
 -    const real gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(
 -            dummyLogger, ddSettings.request1DAnd1Pulse, !isDlbDisabled(ddSettings.initialDlbState),
 -            options.dlbScaling, ir, systemInfo.cellsizeLimit);
 +    DDSettings ddSettings = ddSettingsOriginal;
 +    ddSettings.request1D  = true;
 +    const real gridSetupCellsizeLimit =
 +            getDDGridSetupCellSizeLimit(dummyLogger, !isDlbDisabled(ddSettings.initialDlbState),
 +                                        options.dlbScaling, ir, systemInfo.cellsizeLimit);
      gmx_ddbox_t ddbox = { 0 };
      DDGridSetup ddGridSetup =
              getDDGridSetup(dummyLogger, cr, numRanksRequested, options, ddSettings, systemInfo,
                             gridSetupCellsizeLimit, mtop, ir, box, xGlobal, &ddbox);
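 +    // Below, a grid setup with zero domains along XX is taken to mean that
 +    // no valid 1D decomposition exists.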
  
 -    const bool canMakeDDWith1DAnd1Pulse = (ddGridSetup.numDomains[XX] != 0);
 +    const bool canMake1DDD = (ddGridSetup.numDomains[XX] != 0);
  
 -    return canMakeDDWith1DAnd1Pulse;
 +    return canMake1DDD;
  }
  
 -bool is1DAnd1PulseDD(const gmx_domdec_t& dd)
 +bool is1D(const gmx_domdec_t& dd)
  {
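 +    // The decomposition is 1D when all domains lie along a single dimension,
 +    // i.e. when the largest dimension accounts for the full domain count.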
 -    const int  maxDimensionSize             = std::max(dd.nc[XX], std::max(dd.nc[YY], dd.nc[ZZ]));
 -    const int  productOfDimensionSizes      = dd.nc[XX] * dd.nc[YY] * dd.nc[ZZ];
 +    const int maxDimensionSize        = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ]));
 +    const int productOfDimensionSizes = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ];
      const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
  
 -    const bool hasMax1Pulse =
 -            ((isDlbDisabled(dd.comm) && dd.comm->cellsize_limit >= dd.comm->systemInfo.cutoff)
 -             || (!isDlbDisabled(dd.comm) && dd.comm->maxpulse == 1));
 -
 -    return decompositionHasOneDimension && hasMax1Pulse;
 +    return decompositionHasOneDimension;
  }
  
  namespace gmx
@@@ -2966,7 -2998,7 +2966,7 @@@ public
           t_commrec*           cr,
           const DomdecOptions& options,
           const MdrunOptions&  mdrunOptions,
 -         bool                 prefer1DAnd1Pulse,
 +         bool                 prefer1D,
           const gmx_mtop_t&    mtop,
           const t_inputrec&    ir,
           const matrix         box,
@@@ -3014,7 -3046,7 +3014,7 @@@ DomainDecompositionBuilder::Impl::Impl(
                                         t_commrec*           cr,
                                         const DomdecOptions& options,
                                         const MdrunOptions&  mdrunOptions,
 -                                       const bool           prefer1DAnd1Pulse,
 +                                       const bool           prefer1D,
                                         const gmx_mtop_t&    mtop,
                                         const t_inputrec&    ir,
                                         const matrix         box,
  
      ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
  
 -    if (prefer1DAnd1Pulse
 -        && canMake1DAnd1PulseDomainDecomposition(ddSettings_, cr_, cr_->nnodes, options_, mtop_,
 -                                                 ir_, box, xGlobal))
 +    if (prefer1D
 +        && canMake1DDomainDecomposition(ddSettings_, cr_, cr_->nnodes, options_, mtop_, ir_, box, xGlobal))
      {
 -        ddSettings_.request1DAnd1Pulse = true;
 +        ddSettings_.request1D = true;
      }
  
      if (ddSettings_.eFlop > 1)
      // DD grid setup uses a different cell size limit for
      // automated setup than the one in systemInfo_. The latter is used
      // in set_dd_limits() to configure DLB, for example.
 -    const real gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(
 -            mdlog_, ddSettings_.request1DAnd1Pulse, !isDlbDisabled(ddSettings_.initialDlbState),
 -            options_.dlbScaling, ir_, systemInfo_.cellsizeLimit);
 +    const real gridSetupCellsizeLimit =
 +            getDDGridSetupCellSizeLimit(mdlog_, !isDlbDisabled(ddSettings_.initialDlbState),
 +                                        options_.dlbScaling, ir_, systemInfo_.cellsizeLimit);
      ddGridSetup_ = getDDGridSetup(mdlog_, cr_, numRanksRequested, options_, ddSettings_, systemInfo_,
                                    gridSetupCellsizeLimit, mtop_, ir_, box, xGlobal, &ddbox_);
      checkDDGridSetup(ddGridSetup_, cr_, options_, ddSettings_, systemInfo_, gridSetupCellsizeLimit, ddbox_);
@@@ -3102,12 -3135,12 +3102,12 @@@ DomainDecompositionBuilder::DomainDecom
                                                         t_commrec*           cr,
                                                         const DomdecOptions& options,
                                                         const MdrunOptions&  mdrunOptions,
 -                                                       const bool           prefer1DAnd1Pulse,
 +                                                       const bool           prefer1D,
                                                         const gmx_mtop_t&    mtop,
                                                         const t_inputrec&    ir,
                                                         const matrix         box,
                                                         ArrayRef<const RVec> xGlobal) :
 -    impl_(new Impl(mdlog, cr, options, mdrunOptions, prefer1DAnd1Pulse, mtop, ir, box, xGlobal))
 +    impl_(new Impl(mdlog, cr, options, mdrunOptions, prefer1D, mtop, ir, box, xGlobal))
  {
  }
  
@@@ -3138,7 -3171,7 +3138,7 @@@ static gmx_bool test_dd_cutoff(t_commre
      {
          dim = dd->dim[d];
  
 -        inv_cell_size = DD_CELL_MARGIN * dd->nc[dim] / ddbox.box_size[dim];
 +        inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
          if (dd->unitCellInfo.ddBoxIsDynamic)
          {
              inv_cell_size *= DD_PRES_SCALE_MARGIN;
  
          np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
  
 -        if (dd->comm->ddSettings.request1DAnd1Pulse && np > 1)
 -        {
 -            return FALSE;
 -        }
 -
          if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
          {
              if (np > dd->comm->cd[d].np_dlb)
@@@ -3200,69 -3238,3 +3200,69 @@@ gmx_bool change_dd_cutoff(t_commrec* cr
  
      return bCutoffAllowed;
  }
 +
 +void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
 +                              const t_commrec&                cr,
 +                              const gmx::DeviceStreamManager& deviceStreamManager)
 +{
 +    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
 +                       "Local non-bonded stream should be valid when using "
 +                       "GPU halo exchange.");
 +    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
 +                       "Non-local non-bonded stream should be valid when using "
 +                       "GPU halo exchange.");
 +    int gpuHaloExchangeSize = 0;
 +    int pulseStart          = 0;
 +    if (cr.dd->gpuHaloExchange.empty())
 +    {
 +        GMX_LOG(mdlog.warning)
 +                .asParagraph()
 +                .appendTextFormatted(
 +                        "NOTE: Activating the 'GPU halo exchange' feature, enabled "
 +                        "by the GMX_GPU_DD_COMMS environment variable.");
 +    }
 +    else
 +    {
 +        gpuHaloExchangeSize = static_cast<int>(cr.dd->gpuHaloExchange.size());
 +        pulseStart          = gpuHaloExchangeSize - 1;
 +    }
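 +    // Create the halo-exchange objects that are still missing, one per pulse
 +    // along the first (and, with 1D DD, only) decomposition dimension.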
 +    if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize)
 +    {
 +        for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
 +        {
 +            cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
 +                    cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
 +                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
 +                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
 +        }
 +    }
 +}
 +
 +void reinitGpuHaloExchange(const t_commrec&              cr,
 +                           const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
 +                           const DeviceBuffer<gmx::RVec> d_forcesBuffer)
 +{
 +    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
 +    {
 +        cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
 +    }
 +}
 +
 +void communicateGpuHaloCoordinates(const t_commrec&      cr,
 +                                   const matrix          box,
 +                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 +{
 +    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
 +    {
 +        cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
 +    }
 +}
 +
 +void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
 +{
 +    for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--)
 +    {
 +        cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces);
 +    }
 +}
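 +
 +/* Note: coordinates are communicated over the pulses in forward order above,
 + * while the corresponding forces are accumulated back in reverse pulse order. */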
index cf91196e0ef5440f8a8ab82a60ebbe3b3ff41ad5,cbb9c96b79614fa2c4363729e82bafdb5f51a758..8e1873867cb62076882fbcc7718957e8f7e4d93b
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2017,2018 by the GROMACS development team.
 + * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -293,7 -292,7 +293,7 @@@ static t_matrix read_xpm_entry(FILE* in
          findLabelInLine(lineString, "type"); // discard the returned string
      }
  
-     if (!line || strncmp(line, "static", 6) != 0)
+     if (!line_buf || strncmp(line_buf, "static", 6) != 0)
      {
          gmx_input("Invalid XPixMap");
      }
              line = line_buf;
          }
          bSetLine = TRUE;
+         GMX_RELEASE_ASSERT(line, "Need to have valid line to parse");
          if (strstr(line, "x-axis"))
          {
              line = std::strstr(line, "x-axis");
index 5dda614a9c2ec42ee0399d250ffae987df053ab1,d4e4edefdf5ad6d18c372e47c62c68f360b09c5b..a5a5f82267cf7f775ef620840718251a4a57b300
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2012,2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -189,7 -188,7 +189,7 @@@ static int strip_dssp(FILE
          mat->axis_y.resize(nr);
          std::iota(mat->axis_y.begin(), mat->axis_y.end(), 1);
          mat->axis_x.resize(0);
-         mat->matrix.resize(0, 0);
+         mat->matrix.resize(1, 1);
          bFirst = false;
      }
      mat->axis_x.push_back(t);
@@@ -268,9 -267,15 +268,9 @@@ static void check_oo(t_atoms* atoms
  
      for (i = 0; (i < atoms->nr); i++)
      {
 -        if (std::strcmp(*(atoms->atomname[i]), "OXT") == 0)
 -        {
 -            *atoms->atomname[i] = OOO;
 -        }
 -        else if (std::strcmp(*(atoms->atomname[i]), "O1") == 0)
 -        {
 -            *atoms->atomname[i] = OOO;
 -        }
 -        else if (std::strcmp(*(atoms->atomname[i]), "OC1") == 0)
 +        if ((std::strcmp(*(atoms->atomname[i]), "OXT") == 0)
 +            || (std::strcmp(*(atoms->atomname[i]), "O1") == 0)
 +            || (std::strcmp(*(atoms->atomname[i]), "OC1") == 0))
          {
              *atoms->atomname[i] = OOO;
          }
@@@ -516,7 -521,7 +516,7 @@@ int gmx_do_dssp(int argc, char* argv[]
      const char *      fnSCount, *fnArea, *fnTArea, *fnAArea;
      const char*       leg[] = { "Phobic", "Phylic" };
      t_topology        top;
 -    int               ePBC;
 +    PbcType           pbcType;
      t_atoms*          atoms;
      t_matrix          mat;
      int               nres, nr0, naccr, nres_plus_separators;
      fnAArea    = opt2fn_null("-aa", NFILE, fnm);
      bDoAccSurf = ((fnArea != nullptr) || (fnTArea != nullptr) || (fnAArea != nullptr));
  
 -    read_tps_conf(ftp2fn(efTPS, NFILE, fnm), &top, &ePBC, &xp, nullptr, box, FALSE);
 +    read_tps_conf(ftp2fn(efTPS, NFILE, fnm), &top, &pbcType, &xp, nullptr, box, FALSE);
      atoms = &(top.atoms);
      check_oo(atoms);
      bPhbres = bPhobics(atoms);
      accr  = nullptr;
      naccr = 0;
  
 -    gpbc = gmx_rmpbc_init(&top.idef, ePBC, natoms);
 +    gpbc = gmx_rmpbc_init(&top.idef, pbcType, natoms);
      do
      {
          t = output_env_conv_time(oenv, t);
          }
          gmx_rmpbc(gpbc, natoms, box, x);
          tapein = gmx_ffopen(pdbfile, "w");
 -        write_pdbfile_indexed(tapein, nullptr, atoms, x, ePBC, box, ' ', -1, gnx, index, nullptr, FALSE);
 +        write_pdbfile_indexed(tapein, nullptr, atoms, x, pbcType, box, ' ', -1, gnx, index, nullptr, FALSE);
          gmx_ffclose(tapein);
          /* strip_dssp returns the number of lines found in the dssp file, i.e.
           * the number of residues plus the separator lines */
index 1c92c76e1fd2497bb8d8cad4390b135db99457a6,14b01053e1af92d517cf58cc5fbb93c0be04e4dc..f1bf5bb89e71b700572f09f3e23a1278da75e964
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -78,7 -77,7 +78,7 @@@ static void calc_dihs(t_xrama* xr
      t_dih*      dd;
      gmx_rmpbc_t gpbc = nullptr;
  
 -    gpbc = gmx_rmpbc_init(xr->idef, xr->ePBC, xr->natoms);
 +    gpbc = gmx_rmpbc_init(xr->idef, xr->pbcType, xr->natoms);
      gmx_rmpbc(gpbc, xr->natoms, xr->box, xr->x);
      gmx_rmpbc_done(gpbc);
  
@@@ -121,7 -120,7 +121,7 @@@ static void add_xr(t_xrama* xr, const i
      char buf[12];
      int  i;
  
-     srenew(xr->dih, xr->ndih + 2);
+     srenew(xr->dih, xr->ndih + 2LL);
      for (i = 0; (i < 4); i++)
      {
          xr->dih[xr->ndih].ai[i] = ff[i];
      }
      xr->ndih += 2;
  
-     srenew(xr->pp, xr->npp + 1);
+     srenew(xr->pp, xr->npp + 1LL);
      xr->pp[xr->npp].iphi  = xr->ndih - 2;
      xr->pp[xr->npp].ipsi  = xr->ndih - 1;
      xr->pp[xr->npp].bShow = FALSE;
@@@ -179,7 -178,9 +179,9 @@@ static void min_max(t_xrama* xr
      {
          for (j = 0; (j < 4); j++)
          {
+             MSVC_DIAGNOSTIC_IGNORE(28182) // false positive in 2019 (16.5.4)
              ai = xr->dih[i].ai[j];
+             MSVC_DIAGNOSTIC_RESET
              if (ai < xr->amin)
              {
                  xr->amin = ai;
@@@ -220,7 -221,7 +222,7 @@@ static void get_dih_props(t_xrama* xr, 
          }
  
          i += nra + 1;
-         ia += nra + 1;
+         ia += nra + 1LL;
      }
      /* Fill in defaults for values not in the topology */
      for (i = 0; (i < xr->ndih); i++)
@@@ -241,7 -242,7 +243,7 @@@ t_topology* init_rama(gmx_output_env_t
      t_topology* top;
      real        t;
  
 -    top = read_top(topfile, &xr->ePBC);
 +    top = read_top(topfile, &xr->pbcType);
  
      /*get_dih2(xr,top->idef.functype,&(top->idef.bondeds),&(top->atoms));*/
      get_dih(xr, &(top->atoms));
index 592c73ffa33f434847904de59051da6de5c14ca6,19e5c660dd26bf9e9eaa0a07b502f15d26b376d0..e23eef5bacac81b46820d97095190cd3f8cfbf02
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -39,8 -38,6 +39,8 @@@
  
  #include "nb_free_energy.h"
  
 +#include "config.h"
 +
  #include <cmath>
  
  #include <algorithm>
  #include "gromacs/math/vec.h"
  #include "gromacs/mdtypes/forceoutput.h"
  #include "gromacs/mdtypes/forcerec.h"
 +#include "gromacs/mdtypes/interaction_const.h"
  #include "gromacs/mdtypes/md_enums.h"
 +#include "gromacs/mdtypes/mdatom.h"
 +#include "gromacs/simd/simd.h"
  #include "gromacs/utility/fatalerror.h"
  
  
 -//! Enum for templating the soft-core treatment in the kernel
 -enum class SoftCoreTreatment
 -{
 -    None,    //!< No soft-core
 -    RPower6, //!< Soft-core with r-power = 6
 -    RPower48 //!< Soft-core with r-power = 48
 -};
 -
 -//! Most treatments are fine with float in mixed-precision mode.
 -template<SoftCoreTreatment softCoreTreatment>
 -struct SoftCoreReal
 +//! Scalar (non-SIMD) data types.
 +struct ScalarDataTypes
  {
 -    //! Real type for soft-core calculations
 -    using Real = real;
 +    using RealType                     = real; //!< The data type to use as real.
 +    using IntType                      = int;  //!< The data type to use as int.
 +    static constexpr int simdRealWidth = 1;    //!< The width of the RealType.
 +    static constexpr int simdIntWidth  = 1;    //!< The width of the IntType.
  };
  
 -//! This treatment requires double precision for some computations.
 -template<>
 -struct SoftCoreReal<SoftCoreTreatment::RPower48>
 +#if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
 +//! SIMD data types.
 +struct SimdDataTypes
  {
 -    //! Real type for soft-core calculations
 -    using Real = double;
 +    using RealType                     = gmx::SimdReal;         //!< The data type to use as real.
 +    using IntType                      = gmx::SimdInt32;        //!< The data type to use as int.
 +    static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH;   //!< The width of the RealType.
 +    static constexpr int simdIntWidth  = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
  };
 +#endif
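 +
 +/* Both structs expose the same member names, so the templated kernel below can
 + * be instantiated with either scalar or SIMD data types. */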
  
  //! Computes r^(1/p) and 1/r^(1/p) for the standard p=6
 -template<SoftCoreTreatment softCoreTreatment>
 -static inline void pthRoot(const real r, real* pthRoot, real* invPthRoot)
 -{
 -    *invPthRoot = gmx::invsqrt(std::cbrt(r));
 -    *pthRoot    = 1 / (*invPthRoot);
 -}
 -
 -// We need a double version to make the specialization below work
 -#if !GMX_DOUBLE
 -//! Computes r^(1/p) and 1/r^(1/p) for the standard p=6
 -template<SoftCoreTreatment softCoreTreatment>
 -static inline void pthRoot(const double r, real* pthRoot, double* invPthRoot)
 +template<class RealType>
 +static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot)
  {
      *invPthRoot = gmx::invsqrt(std::cbrt(r));
      *pthRoot    = 1 / (*invPthRoot);
  }
 -#endif
  
 -//! Computes r^(1/p) and 1/r^(1/p) for p=48
 -template<>
 -inline void pthRoot<SoftCoreTreatment::RPower48>(const double r, real* pthRoot, double* invPthRoot)
 +template<class RealType>
 +static inline RealType calculateRinv6(const RealType rinvV)
  {
 -    *pthRoot    = std::pow(r, 1.0 / 48.0);
 -    *invPthRoot = 1 / (*pthRoot);
 +    RealType rinv6 = rinvV * rinvV;
 +    return (rinv6 * rinv6 * rinv6);
  }
  
 -template<SoftCoreTreatment softCoreTreatment>
 -static inline real calculateSigmaPow(const real sigma6)
 -{
 -    if (softCoreTreatment == SoftCoreTreatment::RPower6)
 -    {
 -        return sigma6;
 -    }
 -    else
 -    {
 -        real sigmaPow = sigma6 * sigma6;     /* sigma^12 */
 -        sigmaPow      = sigmaPow * sigmaPow; /* sigma^24 */
 -        sigmaPow      = sigmaPow * sigmaPow; /* sigma^48 */
 -        return (sigmaPow);
 -    }
 -}
 -
 -template<SoftCoreTreatment softCoreTreatment, class SCReal>
 -static inline real calculateRinv6(const SCReal rinvV)
 -{
 -    if (softCoreTreatment == SoftCoreTreatment::RPower6)
 -    {
 -        return rinvV;
 -    }
 -    else
 -    {
 -        real rinv6 = rinvV * rinvV;
 -        return (rinv6 * rinv6 * rinv6);
 -    }
 -}
 -
 -static inline real calculateVdw6(const real c6, const real rinv6)
 +template<class RealType>
 +static inline RealType calculateVdw6(const RealType c6, const RealType rinv6)
  {
      return (c6 * rinv6);
  }
  
 -static inline real calculateVdw12(const real c12, const real rinv6)
 +template<class RealType>
 +static inline RealType calculateVdw12(const RealType c12, const RealType rinv6)
  {
      return (c12 * rinv6 * rinv6);
  }
  
  /* reaction-field electrostatics */
 -template<class SCReal>
 -static inline SCReal
 -reactionFieldScalarForce(const real qq, const real rinv, const SCReal r, const real krf, const real two)
 +template<class RealType>
 +static inline RealType reactionFieldScalarForce(const RealType qq,
 +                                                const RealType rinv,
 +                                                const RealType r,
 +                                                const real     krf,
 +                                                const real     two)
  {
      return (qq * (rinv - two * krf * r * r));
  }
 -template<class SCReal>
 -static inline real
 -reactionFieldPotential(const real qq, const real rinv, const SCReal r, const real krf, const real potentialShift)
 +template<class RealType>
 +static inline RealType reactionFieldPotential(const RealType qq,
 +                                              const RealType rinv,
 +                                              const RealType r,
 +                                              const real     krf,
 +                                              const real     potentialShift)
  {
      return (qq * (rinv + krf * r * r - potentialShift));
  }
  
  /* Ewald electrostatics */
 -static inline real ewaldScalarForce(const real coulomb, const real rinv)
 +template<class RealType>
 +static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rinv)
  {
      return (coulomb * rinv);
  }
 -static inline real ewaldPotential(const real coulomb, const real rinv, const real potentialShift)
 +template<class RealType>
 +static inline RealType ewaldPotential(const RealType coulomb, const RealType rinv, const real potentialShift)
  {
      return (coulomb * (rinv - potentialShift));
  }
  
  /* cutoff LJ */
 -static inline real lennardJonesScalarForce(const real v6, const real v12)
 +template<class RealType>
 +static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
  {
      return (v12 - v6);
  }
 -static inline real lennardJonesPotential(const real v6,
 -                                         const real v12,
 -                                         const real c6,
 -                                         const real c12,
 -                                         const real repulsionShift,
 -                                         const real dispersionShift,
 -                                         const real onesixth,
 -                                         const real onetwelfth)
 +template<class RealType>
 +static inline RealType lennardJonesPotential(const RealType v6,
 +                                             const RealType v12,
 +                                             const RealType c6,
 +                                             const RealType c12,
 +                                             const real     repulsionShift,
 +                                             const real     dispersionShift,
 +                                             const real     onesixth,
 +                                             const real     onetwelfth)
  {
      return ((v12 + c12 * repulsionShift) * onetwelfth - (v6 + c6 * dispersionShift) * onesixth);
  }
@@@ -164,28 -192,25 +164,28 @@@ static inline real ewaldLennardJonesGri
  }
  
  /* LJ Potential switch */
 -template<class SCReal>
 -static inline SCReal potSwitchScalarForceMod(const SCReal fScalarInp,
 -                                             const real   potential,
 -                                             const real   sw,
 -                                             const SCReal r,
 -                                             const real   rVdw,
 -                                             const real   dsw,
 -                                             const real   zero)
 +template<class RealType>
 +static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
 +                                               const RealType potential,
 +                                               const RealType sw,
 +                                               const RealType r,
 +                                               const RealType rVdw,
 +                                               const RealType dsw,
 +                                               const real     zero)
  {
      if (r < rVdw)
      {
 -        SCReal fScalar = fScalarInp * sw - r * potential * dsw;
 +        RealType fScalar = fScalarInp * sw - r * potential * dsw;
          return (fScalar);
      }
      return (zero);
  }
 -template<class SCReal>
 -static inline real
 -potSwitchPotentialMod(const real potentialInp, const real sw, const SCReal r, const real rVdw, const real zero)
 +template<class RealType>
 +static inline RealType potSwitchPotentialMod(const RealType potentialInp,
 +                                             const RealType sw,
 +                                             const RealType r,
 +                                             const RealType rVdw,
 +                                             const real     zero)
  {
      if (r < rVdw)
      {
  
  
  //! Templated free-energy non-bonded kernel
 -template<SoftCoreTreatment softCoreTreatment, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
 +template<typename DataTypes, bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
  static void nb_free_energy_kernel(const t_nblist* gmx_restrict nlist,
                                    rvec* gmx_restrict         xx,
                                    gmx::ForceWithShiftForces* forceWithShiftForces,
                                    nb_kernel_data_t* gmx_restrict kernel_data,
                                    t_nrnb* gmx_restrict nrnb)
  {
 -    using SCReal = typename SoftCoreReal<softCoreTreatment>::Real;
 -
 -    constexpr bool useSoftCore = (softCoreTreatment != SoftCoreTreatment::None);
 -
  #define STATE_A 0
  #define STATE_B 1
  #define NSTATES 2
  
 +    using RealType = typename DataTypes::RealType;
 +    using IntType  = typename DataTypes::IntType;
 +
 +    /* FIXME: How should these be handled with SIMD? */
      constexpr real onetwelfth = 1.0 / 12.0;
      constexpr real onesixth   = 1.0 / 6.0;
      constexpr real zero       = 0.0;
      const int*  typeA         = mdatoms->typeA;
      const int*  typeB         = mdatoms->typeB;
      const int   ntype         = fr->ntype;
 -    const real* nbfp          = fr->nbfp;
 +    const real* nbfp          = fr->nbfp.data();
      const real* nbfp_grid     = fr->ljpme_c6grid;
      real*       Vv            = kernel_data->energygrp_vdw;
      const real  lambda_coul   = kernel_data->lambda[efptCOUL];
      real rcutoff_max2 = std::max(ic->rcoulomb, ic->rvdw);
      rcutoff_max2      = rcutoff_max2 * rcutoff_max2;
  
-     const real* tab_ewald_F_lj = nullptr;
-     const real* tab_ewald_V_lj = nullptr;
-     const real* ewtab          = nullptr;
-     real        ewtabscale     = 0;
-     real        ewtabhalfspace = 0;
-     real        sh_ewald       = 0;
+     const real* tab_ewald_F_lj           = nullptr;
+     const real* tab_ewald_V_lj           = nullptr;
+     const real* ewtab                    = nullptr;
+     real        coulombTableScale        = 0;
+     real        coulombTableScaleInvHalf = 0;
+     real        vdwTableScale            = 0;
+     real        vdwTableScaleInvHalf     = 0;
+     real        sh_ewald                 = 0;
      if (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
      {
-         const auto& tables = *ic->coulombEwaldTables;
-         sh_ewald           = ic->sh_ewald;
-         ewtab              = tables.tableFDV0.data();
-         ewtabscale         = tables.scale;
-         ewtabhalfspace     = half / ewtabscale;
-         tab_ewald_F_lj     = tables.tableF.data();
-         tab_ewald_V_lj     = tables.tableV.data();
+         sh_ewald = ic->sh_ewald;
+     }
+     if (elecInteractionTypeIsEwald)
+     {
+         const auto& coulombTables = *ic->coulombEwaldTables;
+         ewtab                     = coulombTables.tableFDV0.data();
+         coulombTableScale         = coulombTables.scale;
+         coulombTableScaleInvHalf  = half / coulombTableScale;
+     }
+     if (vdwInteractionTypeIsEwald)
+     {
+         const auto& vdwTables = *ic->vdwEwaldTables;
+         tab_ewald_F_lj        = vdwTables.tableF.data();
+         tab_ewald_V_lj        = vdwTables.tableV.data();
+         vdwTableScale         = vdwTables.scale;
+         vdwTableScaleInvHalf  = half / vdwTableScale;
      }
  
      /* For Ewald/PME interactions we cannot easily apply the soft-core component to
      GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
                         "Can not apply soft-core to switched Ewald potentials");
  
 -    SCReal dvdl_coul = 0; /* Needs double for sc_power==48 */
 -    SCReal dvdl_vdw  = 0; /* Needs double for sc_power==48 */
 +    real dvdl_coul = 0;
 +    real dvdl_vdw  = 0;
  
      /* Lambda factor for state A, 1-lambda*/
      real LFC[NSTATES], LFV[NSTATES];
      DLF[STATE_B] = 1;
  
      real           lfac_coul[NSTATES], dlfac_coul[NSTATES], lfac_vdw[NSTATES], dlfac_vdw[NSTATES];
 -    constexpr real sc_r_power = (softCoreTreatment == SoftCoreTreatment::RPower48 ? 48.0_real : 6.0_real);
 +    constexpr real sc_r_power = 6.0_real;
      for (int i = 0; i < NSTATES; i++)
      {
          lfac_coul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
  
          for (int k = nj0; k < nj1; k++)
          {
 -            int        tj[NSTATES];
 -            const int  jnr = jjnr[k];
 -            const int  j3  = 3 * jnr;
 -            real       c6[NSTATES], c12[NSTATES], qq[NSTATES], Vcoul[NSTATES], Vvdw[NSTATES];
 -            real       r, rinv, rp, rpm2;
 -            real       alpha_vdw_eff, alpha_coul_eff, sigma_pow[NSTATES];
 -            const real dx  = ix - x[j3];
 -            const real dy  = iy - x[j3 + 1];
 -            const real dz  = iz - x[j3 + 2];
 -            const real rsq = dx * dx + dy * dy + dz * dz;
 -            SCReal     FscalC[NSTATES], FscalV[NSTATES]; /* Needs double for sc_power==48 */
 -
 -            if (rsq >= rcutoff_max2)
 +            int            tj[NSTATES];
 +            const int      jnr = jjnr[k];
 +            const int      j3  = 3 * jnr;
 +            RealType       c6[NSTATES], c12[NSTATES], qq[NSTATES], Vcoul[NSTATES], Vvdw[NSTATES];
 +            RealType       r, rinv, rp, rpm2;
 +            RealType       alpha_vdw_eff, alpha_coul_eff, sigma6[NSTATES];
 +            const RealType dx  = ix - x[j3];
 +            const RealType dy  = iy - x[j3 + 1];
 +            const RealType dz  = iz - x[j3 + 2];
 +            const RealType rsq = dx * dx + dy * dy + dz * dz;
 +            RealType       FscalC[NSTATES], FscalV[NSTATES];
 +            /* Check if this pair is on the exclusions list. */
 +            const bool bPairIncluded = nlist->excl_fep == nullptr || nlist->excl_fep[k];
 +
 +            if (rsq >= rcutoff_max2 && bPairIncluded)
              {
                  /* We save significant time by skipping all code below.
                   * Note that with soft-core interactions, the actual cut-off
                   * check might be different. But since the soft-core distance
                   * is always larger than r, checking on r here is safe.
 +                 * Exclusions outside the cutoff can not be skipped,
 +                 * because when using Ewald the reciprocal-space
 +                 * Ewald component still needs to be subtracted.
                   */
 +
                  continue;
              }
              npair_within_cutoff++;
                  r    = 0;
              }
  
 -            if (softCoreTreatment == SoftCoreTreatment::None)
 +            if (useSoftCore)
 +            {
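 +                /* With soft-core, the distance enters through r^6 in the
 +                 * denominator alpha*lambda*sigma^6 + r^6 used further down. */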
 +                rpm2 = rsq * rsq;  /* r4 */
 +                rp   = rpm2 * rsq; /* r6 */
 +            }
 +            else
              {
                  /* The soft-core power p will not affect the results
                   * with not using soft-core, so we use power of 0 which gives
                  rpm2 = rinv * rinv;
                  rp   = 1;
              }
 -            if (softCoreTreatment == SoftCoreTreatment::RPower6)
 -            {
 -                rpm2 = rsq * rsq;  /* r4 */
 -                rp   = rpm2 * rsq; /* r6 */
 -            }
 -            if (softCoreTreatment == SoftCoreTreatment::RPower48)
 -            {
 -                rp   = rsq * rsq * rsq; /* r6 */
 -                rp   = rp * rp;         /* r12 */
 -                rp   = rp * rp;         /* r24 */
 -                rp   = rp * rp;         /* r48 */
 -                rpm2 = rp / rsq;        /* r46 */
 -            }
  
 -            real Fscal = 0;
 +            RealType Fscal = 0;
  
              qq[STATE_A] = iqA * chargeA[jnr];
              qq[STATE_B] = iqB * chargeB[jnr];
              tj[STATE_A] = ntiA + 2 * typeA[jnr];
              tj[STATE_B] = ntiB + 2 * typeB[jnr];
  
 -            if (nlist->excl_fep == nullptr || nlist->excl_fep[k])
 +            if (bPairIncluded)
              {
                  c6[STATE_A] = nbfp[tj[STATE_A]];
                  c6[STATE_B] = nbfp[tj[STATE_B]];
                      c12[i] = nbfp[tj[i] + 1];
                      if (useSoftCore)
                      {
 -                        real sigma6[NSTATES];
                          if ((c6[i] > 0) && (c12[i] > 0))
                          {
                              /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
                          {
                              sigma6[i] = sigma6_def;
                          }
 -                        sigma_pow[i] = calculateSigmaPow<softCoreTreatment>(sigma6[i]);
                      }
                  }
  
                      Vcoul[i]  = 0;
                      Vvdw[i]   = 0;
  
 -                    real   rinvC, rinvV;
 -                    SCReal rC, rV, rpinvC, rpinvV; /* Needs double for sc_power==48 */
 +                    RealType rinvC, rinvV, rC, rV, rpinvC, rpinvV;
  
                      /* Only spend time on A or B state if it is non-zero */
                      if ((qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0))
                      {
 -                        /* this section has to be inside the loop because of the dependence on sigma_pow */
 +                        /* this section has to be inside the loop because of the dependence on sigma6 */
                          if (useSoftCore)
                          {
 -                            rpinvC = one / (alpha_coul_eff * lfac_coul[i] * sigma_pow[i] + rp);
 -                            pthRoot<softCoreTreatment>(rpinvC, &rinvC, &rC);
 +                            rpinvC = one / (alpha_coul_eff * lfac_coul[i] * sigma6[i] + rp);
 +                            pthRoot(rpinvC, &rinvC, &rC);
                              if (scLambdasOrAlphasDiffer)
                              {
 -                                rpinvV = one / (alpha_vdw_eff * lfac_vdw[i] * sigma_pow[i] + rp);
 -                                pthRoot<softCoreTreatment>(rpinvV, &rinvV, &rV);
 +                                rpinvV = one / (alpha_vdw_eff * lfac_vdw[i] * sigma6[i] + rp);
 +                                pthRoot(rpinvV, &rinvV, &rV);
                              }
                              else
                              {
                                                       || (!vdwInteractionTypeIsEwald && rV < rvdw);
                          if ((c6[i] != 0 || c12[i] != 0) && computeVdwInteraction)
                          {
 -                            real rinv6;
 -                            if (softCoreTreatment == SoftCoreTreatment::RPower6)
 +                            RealType rinv6;
 +                            if (useSoftCore)
                              {
 -                                rinv6 = calculateRinv6<softCoreTreatment>(rpinvV);
 +                                rinv6 = rpinvV;
                              }
                              else
                              {
 -                                rinv6 = calculateRinv6<softCoreTreatment>(rinvV);
 +                                rinv6 = calculateRinv6(rinvV);
                              }
 -                            real Vvdw6  = calculateVdw6(c6[i], rinv6);
 -                            real Vvdw12 = calculateVdw12(c12[i], rinv6);
 +                            RealType Vvdw6  = calculateVdw6(c6[i], rinv6);
 +                            RealType Vvdw12 = calculateVdw12(c12[i], rinv6);
  
                              Vvdw[i] = lennardJonesPotential(Vvdw6, Vvdw12, c6[i], c12[i], repulsionShift,
                                                              dispersionShift, onesixth, onetwelfth);
  
                              if (vdwModifierIsPotSwitch)
                              {
 -                                real d        = rV - ic->rvdw_switch;
 -                                d             = (d > zero) ? d : zero;
 -                                const real d2 = d * d;
 -                                const real sw = one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
 -                                const real dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
 +                                RealType d        = rV - ic->rvdw_switch;
 +                                d                 = (d > zero) ? d : zero;
 +                                const RealType d2 = d * d;
 +                                const RealType sw =
 +                                        one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
 +                                const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
  
                                  FscalV[i] = potSwitchScalarForceMod(FscalV[i], Vvdw[i], sw, rV,
                                                                      rvdw, dsw, zero);
                          FscalC[i] *= rpinvC;
                          FscalV[i] *= rpinvV;
                      }
 -                }
 +                } // end for (int i = 0; i < NSTATES; i++)
  
                  /* Assemble A and B states */
                  for (int i = 0; i < NSTATES; i++)
  
                      if (useSoftCore)
                      {
 -                        dvdl_coul +=
 -                                Vcoul[i] * DLF[i]
 -                                + LFC[i] * alpha_coul_eff * dlfac_coul[i] * FscalC[i] * sigma_pow[i];
 +                        dvdl_coul += Vcoul[i] * DLF[i]
 +                                     + LFC[i] * alpha_coul_eff * dlfac_coul[i] * FscalC[i] * sigma6[i];
                          dvdl_vdw += Vvdw[i] * DLF[i]
 -                                    + LFV[i] * alpha_vdw_eff * dlfac_vdw[i] * FscalV[i] * sigma_pow[i];
 +                                    + LFV[i] * alpha_vdw_eff * dlfac_vdw[i] * FscalV[i] * sigma6[i];
                      }
                      else
                      {
                          dvdl_vdw += Vvdw[i] * DLF[i];
                      }
                  }
 -            }
 +            } // end if (bPairIncluded)
              else if (icoul == GMX_NBKERNEL_ELEC_REACTIONFIELD)
              {
                  /* For excluded pairs, which are only in this pair list when
                   * As there is no singularity, there is no need for soft-core.
                   */
                  const real FF = -two * krf;
 -                real       VV = krf * rsq - crf;
 +                RealType   VV = krf * rsq - crf;
  
                  if (ii == jnr)
                  {
                  }
              }
  
 -            if (elecInteractionTypeIsEwald && r < rcoulomb)
 +            if (elecInteractionTypeIsEwald && (r < rcoulomb || !bPairIncluded))
              {
                  /* See comment in the preamble. When using Ewald interactions
                   * (unless we use a switch modifier) we subtract the reciprocal-space
                   */
                  real v_lr, f_lr;
  
-                 const RealType ewrt   = r * ewtabscale;
 -                const real ewrt   = r * coulombTableScale;
 -                int        ewitab = static_cast<int>(ewrt);
 -                const real eweps  = ewrt - ewitab;
 -                ewitab            = 4 * ewitab;
 -                f_lr              = ewtab[ewitab] + eweps * ewtab[ewitab + 1];
++                const RealType ewrt   = r * coulombTableScale;
 +                IntType        ewitab = static_cast<IntType>(ewrt);
 +                const RealType eweps  = ewrt - ewitab;
 +                ewitab                = 4 * ewitab;
 +                f_lr                  = ewtab[ewitab] + eweps * ewtab[ewitab + 1];
-                 v_lr = (ewtab[ewitab + 2] - ewtabhalfspace * eweps * (ewtab[ewitab] + f_lr));
+                 v_lr = (ewtab[ewitab + 2] - coulombTableScaleInvHalf * eweps * (ewtab[ewitab] + f_lr));
                  f_lr *= rinv;
  
                  /* Note that any possible Ewald shift has already been applied in
                   * r close to 0 for non-interacting pairs.
                   */
  
-                 const RealType rs   = rsq * rinv * ewtabscale;
 -                const real rs   = rsq * rinv * vdwTableScale;
 -                const int  ri   = static_cast<int>(rs);
 -                const real frac = rs - ri;
 -                const real f_lr = (1 - frac) * tab_ewald_F_lj[ri] + frac * tab_ewald_F_lj[ri + 1];
++                const RealType rs   = rsq * rinv * vdwTableScale;
 +                const IntType  ri   = static_cast<IntType>(rs);
 +                const RealType frac = rs - ri;
 +                const RealType f_lr = (1 - frac) * tab_ewald_F_lj[ri] + frac * tab_ewald_F_lj[ri + 1];
                  /* TODO: Currently the Ewald LJ table does not contain
                   * the factor 1/6, we should add this.
                   */
 -                const real FF = f_lr * rinv / six;
 -                real VV = (tab_ewald_V_lj[ri] - vdwTableScaleInvHalf * frac * (tab_ewald_F_lj[ri] + f_lr))
 -                          / six;
 +                const RealType FF = f_lr * rinv / six;
 +                RealType       VV =
-                         (tab_ewald_V_lj[ri] - ewtabhalfspace * frac * (tab_ewald_F_lj[ri] + f_lr)) / six;
++                        (tab_ewald_V_lj[ri] - vdwTableScaleInvHalf * frac * (tab_ewald_F_lj[ri] + f_lr))
++                        / six;
  
                  if (ii == jnr)
                  {
  #pragma omp atomic
                  f[j3 + 2] -= tz;
              }
 -        }
 +        } // end for (int k = nj0; k < nj1; k++)
  
          /* The atomics below are expensive with many OpenMP threads.
           * Here unperturbed i-particles will usually only have a few
                  Vv[ggid] += vvtot;
              }
          }
 -    }
 +    } // end for (int n = 0; n < nri; n++)
  
  #pragma omp atomic
      dvdl[efptCOUL] += dvdl_coul;
@@@ -836,93 -877,69 +848,93 @@@ typedef void (*KernelFunction)(const t_
                                 nb_kernel_data_t* gmx_restrict kernel_data,
                                 t_nrnb* gmx_restrict nrnb);
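 +
 +/* Each dispatchKernelOn*() helper below turns one runtime flag into a template
 + * argument; the innermost call then returns a fully specialized kernel. */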
  
 -template<SoftCoreTreatment softCoreTreatment, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
 -static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch)
 +template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
 +static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
 +{
 +    if (useSimd)
 +    {
 +#if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
 +        /* FIXME: Here SimdDataTypes should be used to enable SIMD. So far, the code in nb_free_energy_kernel is not adapted to SIMD */
 +        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                      elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
 +#else
 +        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                      elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
 +#endif
 +    }
 +    else
 +    {
 +        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                      elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
 +    }
 +}
 +
 +template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
 +static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch, const bool useSimd)
  {
      if (vdwModifierIsPotSwitch)
      {
 -        return (nb_free_energy_kernel<softCoreTreatment, scLambdasOrAlphasDiffer,
 -                                      vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>);
 +        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                        elecInteractionTypeIsEwald, true>(useSimd));
      }
      else
      {
 -        return (nb_free_energy_kernel<softCoreTreatment, scLambdasOrAlphasDiffer,
 -                                      vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>);
 +        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                        elecInteractionTypeIsEwald, false>(useSimd));
      }
  }
  
 -template<SoftCoreTreatment softCoreTreatment, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
 +template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
  static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
 -                                                          const bool vdwModifierIsPotSwitch)
 +                                                          const bool vdwModifierIsPotSwitch,
 +                                                          const bool useSimd)
  {
      if (elecInteractionTypeIsEwald)
      {
 -        return (dispatchKernelOnVdwModifier<softCoreTreatment, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
 -                vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
 +                vdwModifierIsPotSwitch, useSimd));
      }
      else
      {
 -        return (dispatchKernelOnVdwModifier<softCoreTreatment, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
 -                vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
 +                vdwModifierIsPotSwitch, useSimd));
      }
  }
  
 -template<SoftCoreTreatment softCoreTreatment, bool scLambdasOrAlphasDiffer>
 +template<bool useSoftCore, bool scLambdasOrAlphasDiffer>
  static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
                                                           const bool elecInteractionTypeIsEwald,
 -                                                         const bool vdwModifierIsPotSwitch)
 +                                                         const bool vdwModifierIsPotSwitch,
 +                                                         const bool useSimd)
  {
      if (vdwInteractionTypeIsEwald)
      {
 -        return (dispatchKernelOnElecInteractionType<softCoreTreatment, scLambdasOrAlphasDiffer, true>(
 -                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, true>(
 +                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
      }
      else
      {
 -        return (dispatchKernelOnElecInteractionType<softCoreTreatment, scLambdasOrAlphasDiffer, false>(
 -                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, false>(
 +                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
      }
  }
  
 -template<SoftCoreTreatment softCoreTreatment>
 +template<bool useSoftCore>
  static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
                                                                    const bool vdwInteractionTypeIsEwald,
                                                                    const bool elecInteractionTypeIsEwald,
 -                                                                  const bool vdwModifierIsPotSwitch)
 +                                                                  const bool vdwModifierIsPotSwitch,
 +                                                                  const bool useSimd)
  {
      if (scLambdasOrAlphasDiffer)
      {
 -        return (dispatchKernelOnVdwInteractionType<softCoreTreatment, true>(
 -                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnVdwInteractionType<useSoftCore, true>(
 +                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
      }
      else
      {
 -        return (dispatchKernelOnVdwInteractionType<softCoreTreatment, false>(
 -                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch));
 +        return (dispatchKernelOnVdwInteractionType<useSoftCore, false>(
 +                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
      }
  }
  
@@@ -930,20 -947,25 +942,20 @@@ static KernelFunction dispatchKernel(co
                                       const bool        vdwInteractionTypeIsEwald,
                                       const bool        elecInteractionTypeIsEwald,
                                       const bool        vdwModifierIsPotSwitch,
 +                                     const bool        useSimd,
                                       const t_forcerec* fr)
  {
      if (fr->sc_alphacoul == 0 && fr->sc_alphavdw == 0)
      {
 -        return (dispatchKernelOnScLambdasOrAlphasDifference<SoftCoreTreatment::None>(
 +        return (dispatchKernelOnScLambdasOrAlphasDifference<false>(
                  scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald,
 -                vdwModifierIsPotSwitch));
 -    }
 -    else if (fr->sc_r_power == 6.0_real)
 -    {
 -        return (dispatchKernelOnScLambdasOrAlphasDifference<SoftCoreTreatment::RPower6>(
 -                scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald,
 -                vdwModifierIsPotSwitch));
 +                vdwModifierIsPotSwitch, useSimd));
      }
      else
      {
 -        return (dispatchKernelOnScLambdasOrAlphasDifference<SoftCoreTreatment::RPower48>(
 +        return (dispatchKernelOnScLambdasOrAlphasDifference<true>(
                  scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald,
 -                vdwModifierIsPotSwitch));
 +                vdwModifierIsPotSwitch, useSimd));
      }
  }
  
@@@ -963,13 -985,12 +975,13 @@@ void gmx_nb_free_energy_kernel(const t_
      const bool elecInteractionTypeIsEwald = (EEL_PME_EWALD(fr->ic->eeltype));
      const bool vdwModifierIsPotSwitch     = (fr->ic->vdw_modifier == eintmodPOTSWITCH);
      bool       scLambdasOrAlphasDiffer    = true;
 +    const bool useSimd                    = fr->use_simd_kernels;
  
      if (fr->sc_alphacoul == 0 && fr->sc_alphavdw == 0)
      {
          scLambdasOrAlphasDiffer = false;
      }
 -    else if (fr->sc_r_power == 6.0_real || fr->sc_r_power == 48.0_real)
 +    else if (fr->sc_r_power == 6.0_real)
      {
          if (kernel_data->lambda[efptCOUL] == kernel_data->lambda[efptVDW] && fr->sc_alphacoul == fr->sc_alphavdw)
          {
      {
          GMX_RELEASE_ASSERT(false, "Unsupported soft-core r-power");
      }
 -    KernelFunction kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 -                                               elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, fr);
 +
 +    KernelFunction kernelFunc;
 +    kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
 +                                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd, fr);
      kernelFunc(nlist, xx, ff, fr, mdatoms, kernel_data, nrnb);
  }
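
The refactoring above replaces the SoftCoreTreatment enum template parameter with plain bool template parameters; each runtime flag (useSoftCore, scLambdasOrAlphasDiffer, the Ewald choices, useSimd) is peeled off by one dispatch layer that returns a pointer to the fully specialized kernel. A minimal, self-contained sketch of that dispatch pattern, with illustrative names only (not the GROMACS API):

#include <cstdio>

// A runtime flag selects a template instantiation, so the hot loop is compiled
// with the flag as a compile-time constant (the pattern used by the dispatch layers above).
template<bool useSoftCore>
static double kernel(int n)
{
    double sum = 0;
    for (int i = 0; i < n; i++)
    {
        // Resolved at compile time in each instantiation; no per-iteration branch cost.
        sum += useSoftCore ? 2.0 * i : 1.0 * i;
    }
    return sum;
}

using KernelFunction = double (*)(int);

static KernelFunction dispatchOnSoftCore(const bool useSoftCore)
{
    return useSoftCore ? kernel<true> : kernel<false>;
}

int main()
{
    const KernelFunction f = dispatchOnSoftCore(/* useSoftCore = */ true);
    std::printf("sum = %g\n", f(1000));
    return 0;
}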
index 443008c374e9d76f67a9fd75c14b42a96b453a12,129ab880d367ea5ba1eab5461c2525adbf19e3d8..75fd2c8e8fbc0f03dc9eb4e21d802e3d2d16a0d1
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -65,6 -64,7 +65,6 @@@
  #include "gromacs/mdtypes/fcdata.h"
  #include "gromacs/mdtypes/mdatom.h"
  #include "gromacs/pbcutil/ishift.h"
 -#include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pbcutil/pbc_simd.h"
  #include "gromacs/simd/simd.h"
@@@ -92,6 -92,7 +92,6 @@@ using BondedFunction = real (*)(in
                                  rvec4            f[],
                                  rvec             fshift[],
                                  const t_pbc*     pbc,
 -                                const t_graph*   g,
                                  real             lambda,
                                  real*            dvdlambda,
                                  const t_mdatoms* md,
@@@ -203,17 -204,26 +203,17 @@@ namespac
  
  /*! \brief Spreads and accumulates the bonded forces to the two atoms and adds the virial contribution when needed
   *
 - * When \p g==nullptr, \p shiftIndex is used as the periodic shift.
 - * When \p g!=nullptr, the graph is used to compute the periodic shift.
 + * \p shiftIndex is used as the periodic shift.
   */
  template<BondedKernelFlavor flavor>
 -inline void spreadBondForces(const real     bondForce,
 -                             const rvec     dx,
 -                             const int      ai,
 -                             const int      aj,
 -                             rvec4*         f,
 -                             int            shiftIndex,
 -                             const t_graph* g,
 -                             rvec*          fshift)
 +inline void spreadBondForces(const real bondForce,
 +                             const rvec dx,
 +                             const int  ai,
 +                             const int  aj,
 +                             rvec4*     f,
 +                             int        shiftIndex,
 +                             rvec*      fshift)
  {
 -    if (computeVirial(flavor) && g)
 -    {
 -        ivec dt;
 -        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
 -        shiftIndex = IVEC2IS(dt);
 -    }
 -
      for (int m = 0; m < DIM; m++) /*  15          */
      {
          const real fij = bondForce * dx[m];
@@@ -246,6 -256,7 +246,6 @@@ real morse_bonds(int             nbonds
                   rvec4           f[],
                   rvec            fshift[],
                   const t_pbc*    pbc,
 -                 const t_graph*  g,
                   real            lambda,
                   real*           dvdlambda,
                   const t_mdatoms gmx_unused* md,
          *dvdlambda += (cbB - cbA) * omtemp * omtemp
                        - (2 - 2 * omtemp) * omtemp * cb * ((b0B - b0A) * be - (beB - beA) * (dr - b0)); /* 15 */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /*  83 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /*  83 TOTAL    */
      return vtot;
  }
  
@@@ -314,6 -325,7 +314,6 @@@ real cubic_bonds(int             nbonds
                   rvec4           f[],
                   rvec            fshift[],
                   const t_pbc*    pbc,
 -                 const t_graph*  g,
                   real gmx_unused lambda,
                   real gmx_unused* dvdlambda,
                   const t_mdatoms gmx_unused* md,
  
          vtot += vbond; /* 21 */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /*  54 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /*  54 TOTAL    */
      return vtot;
  }
  
@@@ -369,6 -381,7 +369,6 @@@ real FENE_bonds(int             nbonds
                  rvec4           f[],
                  rvec            fshift[],
                  const t_pbc*    pbc,
 -                const t_graph*  g,
                  real gmx_unused lambda,
                  real gmx_unused* dvdlambda,
                  const t_mdatoms gmx_unused* md,
  
          vtot += vbond; /* 35 */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /*  58 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /*  58 TOTAL    */
      return vtot;
  }
  
@@@ -454,6 -467,7 +454,6 @@@ real bonds(int             nbonds
             rvec4           f[],
             rvec            fshift[],
             const t_pbc*    pbc,
 -           const t_graph*  g,
             real            lambda,
             real*           dvdlambda,
             const t_mdatoms gmx_unused* md,
          vtot += vbond;              /* 1*/
          fbond *= gmx::invsqrt(dr2); /*   6            */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 59 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 59 TOTAL       */
      return vtot;
  }
  
@@@ -501,6 -515,7 +501,6 @@@ real restraint_bonds(int             nb
                       rvec4           f[],
                       rvec            fshift[],
                       const t_pbc*    pbc,
 -                     const t_graph*  g,
                       real            lambda,
                       real*           dvdlambda,
                       const t_mdatoms gmx_unused* md,
          vtot += vbond;              /* 1*/
          fbond *= gmx::invsqrt(dr2); /*   6            */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 59 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 59 TOTAL       */
  
      return vtot;
  }
@@@ -589,6 -604,7 +589,6 @@@ real polarize(int              nbonds
                rvec4            f[],
                rvec             fshift[],
                const t_pbc*     pbc,
 -              const t_graph*   g,
                real             lambda,
                real*            dvdlambda,
                const t_mdatoms* md,
          vtot += vbond;              /* 1*/
          fbond *= gmx::invsqrt(dr2); /*   6            */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 59 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 59 TOTAL       */
      return vtot;
  }
  
@@@ -634,6 -650,7 +634,6 @@@ real anharm_polarize(int              n
                       rvec4            f[],
                       rvec             fshift[],
                       const t_pbc*     pbc,
 -                     const t_graph*   g,
                       real             lambda,
                       real*            dvdlambda,
                       const t_mdatoms* md,
          fbond *= gmx::invsqrt(dr2); /*   6            */
          vtot += vbond;              /* 1*/
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 72 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 72 TOTAL       */
      return vtot;
  }
  
@@@ -688,6 -705,7 +688,6 @@@ real water_pol(int             nbonds
                 rvec4           f[],
                 rvec gmx_unused fshift[],
                 const t_pbc gmx_unused* pbc,
 -               const t_graph gmx_unused* g,
                 real gmx_unused lambda,
                 real gmx_unused* dvdlambda,
                 const t_mdatoms gmx_unused* md,
       * three spatial dimensions in the molecular frame.
       */
      int  i, m, aO, aH1, aH2, aD, aS, type, type0, ki;
 -    ivec dt;
      rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
      real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS;
  
              kdx[ZZ] = kk[ZZ] * dx[ZZ];
              vtot += iprod(dx, kdx);
  
 -            if (computeVirial(flavor) && g)
 -            {
 -                ivec_sub(SHIFT_IVEC(g, aS), SHIFT_IVEC(g, aD), dt);
 -                ki = IVEC2IS(dt);
 -            }
 -
              for (m = 0; (m < DIM); m++)
              {
                  /* This is a tensor operation but written out for speed */
@@@ -832,6 -857,7 +832,6 @@@ real thole_pol(int             nbonds
                 rvec4           f[],
                 rvec            fshift[],
                 const t_pbc*    pbc,
 -               const t_graph gmx_unused* g,
                 real gmx_unused lambda,
                 real gmx_unused* dvdlambda,
                 const t_mdatoms* md,
      return V;
  }
  
 -// Avoid gcc 386 -O3 code generation bug in this function (see Redmine
 +// Avoid gcc 386 -O3 code generation bug in this function (see Issue
  // #3205 for more information)
  #if defined(__GNUC__) && defined(__i386__) && defined(__OPTIMIZE__)
  #    pragma GCC push_options
@@@ -883,6 -909,7 +883,6 @@@ angles(int             nbonds
         rvec4           f[],
         rvec            fshift[],
         const t_pbc*    pbc,
 -       const t_graph*  g,
         real            lambda,
         real*           dvdlambda,
         const t_mdatoms gmx_unused* md,
      int  i, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
      real cos_theta, cos_theta2, theta, dVdt, va, vtot;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; i < nbonds;)
              }
              if (computeVirial(flavor))
              {
 -                if (g != nullptr)
 -                {
 -                    copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                    t1 = IVEC2IS(dt_ij);
 -                    t2 = IVEC2IS(dt_kj);
 -                }
                  rvec_inc(fshift[t1], f_i);
                  rvec_inc(fshift[CENTRAL], f_j);
                  rvec_inc(fshift[t2], f_k);
@@@ -970,6 -1007,7 +970,6 @@@ angles(int             nbonds
         rvec4           f[],
         rvec gmx_unused fshift[],
         const t_pbc*    pbc,
 -       const t_graph gmx_unused* g,
         real gmx_unused lambda,
         real gmx_unused* dvdlambda,
         const t_mdatoms gmx_unused* md,
@@@ -1117,6 -1155,7 +1117,6 @@@ real linear_angles(int             nbon
                     rvec4           f[],
                     rvec            fshift[],
                     const t_pbc*    pbc,
 -                   const t_graph*  g,
                     real            lambda,
                     real*           dvdlambda,
                     const t_mdatoms gmx_unused* md,
      int  i, m, ai, aj, ak, t1, t2, type;
      rvec f_i, f_j, f_k;
      real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin;
 -    ivec jt, dt_ij, dt_kj;
      rvec r_ij, r_kj, r_ik, dx;
  
      L1   = 1 - lambda;
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -            }
              rvec_inc(fshift[t1], f_i);
              rvec_inc(fshift[CENTRAL], f_j);
              rvec_inc(fshift[t2], f_k);
@@@ -1187,6 -1236,7 +1187,6 @@@ urey_bradley(int             nbonds
               rvec4           f[],
               rvec            fshift[],
               const t_pbc*    pbc,
 -             const t_graph*  g,
               real            lambda,
               real*           dvdlambda,
               const t_mdatoms gmx_unused* md,
      real cos_theta, cos_theta2, theta;
      real dVdt, va, vtot, dr, dr2, vbond, fbond, fik;
      real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B;
 -    ivec jt, dt_ij, dt_kj, dt_ik;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
              }
              if (computeVirial(flavor))
              {
 -                if (g)
 -                {
 -                    copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                    t1 = IVEC2IS(dt_ij);
 -                    t2 = IVEC2IS(dt_kj);
 -                }
                  rvec_inc(fshift[t1], f_i);
                  rvec_inc(fshift[CENTRAL], f_j);
                  rvec_inc(fshift[t2], f_k);
          vtot += vbond;              /* 1*/
          fbond *= gmx::invsqrt(dr2); /*   6            */
  
 -        if (computeVirial(flavor) && g)
 -        {
 -            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik);
 -            ki = IVEC2IS(dt_ik);
 -        }
          for (m = 0; (m < DIM); m++) /*  15            */
          {
              fik = fbond * r_ik[m];
@@@ -1297,6 -1362,7 +1297,6 @@@ urey_bradley(int             nbonds
               rvec4           f[],
               rvec gmx_unused fshift[],
               const t_pbc*    pbc,
 -             const t_graph gmx_unused* g,
               real gmx_unused lambda,
               real gmx_unused* dvdlambda,
               const t_mdatoms gmx_unused* md,
@@@ -1439,6 -1505,7 +1439,6 @@@ real quartic_angles(int             nbo
                      rvec4           f[],
                      rvec            fshift[],
                      const t_pbc*    pbc,
 -                    const t_graph*  g,
                      real gmx_unused lambda,
                      real gmx_unused* dvdlambda,
                      const t_mdatoms gmx_unused* md,
      int  i, j, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
      real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
  
              if (computeVirial(flavor))
              {
 -                if (g)
 -                {
 -                    copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                    t1 = IVEC2IS(dt_ij);
 -                    t2 = IVEC2IS(dt_kj);
 -                }
                  rvec_inc(fshift[t1], f_i);
                  rvec_inc(fshift[CENTRAL], f_j);
                  rvec_inc(fshift[t2], f_k);
@@@ -1636,29 -1713,31 +1636,29 @@@ inline void dih_angle_simd(const rvec* 
  } // namespace
  
  template<BondedKernelFlavor flavor>
 -void do_dih_fup(int            i,
 -                int            j,
 -                int            k,
 -                int            l,
 -                real           ddphi,
 -                rvec           r_ij,
 -                rvec           r_kj,
 -                rvec           r_kl,
 -                rvec           m,
 -                rvec           n,
 -                rvec4          f[],
 -                rvec           fshift[],
 -                const t_pbc*   pbc,
 -                const t_graph* g,
 -                const rvec     x[],
 -                int            t1,
 -                int            t2,
 -                int            t3)
 +void do_dih_fup(int          i,
 +                int          j,
 +                int          k,
 +                int          l,
 +                real         ddphi,
 +                rvec         r_ij,
 +                rvec         r_kj,
 +                rvec         r_kl,
 +                rvec         m,
 +                rvec         n,
 +                rvec4        f[],
 +                rvec         fshift[],
 +                const t_pbc* pbc,
 +                const rvec   x[],
 +                int          t1,
 +                int          t2,
 +                int          t3)
  {
      /* 143 FLOPS */
      rvec f_i, f_j, f_k, f_l;
      rvec uvec, vvec, svec, dx_jl;
      real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
      real a, b, p, q, toler;
 -    ivec jt, dt_ij, dt_kj, dt_lj;
  
      iprm  = iprod(m, m);       /*  5    */
      iprn  = iprod(n, n);       /*  5  */
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, j), jt);
 -                ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj);
 -                ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -                t3 = IVEC2IS(dt_lj);
 -            }
 -            else if (pbc)
 +            if (pbc)
              {
                  t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl);
              }
@@@ -1804,6 -1893,7 +1804,6 @@@ pdihs(int             nbonds
        rvec4           f[],
        rvec            fshift[],
        const t_pbc*    pbc,
 -      const t_graph*  g,
        real            lambda,
        real*           dvdlambda,
        const t_mdatoms gmx_unused* md,
          } while (i < nbonds && forceatoms[i + 1] == ai && forceatoms[i + 2] == aj
                   && forceatoms[i + 3] == ak && forceatoms[i + 4] == al);
  
 -        do_dih_fup<flavor>(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, g, x,
 -                           t1, t2, t3); /* 112                */
 -    }                                   /* 223 TOTAL  */
 +        do_dih_fup<flavor>(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, x, t1,
 +                           t2, t3); /* 112            */
 +    }                               /* 223 TOTAL  */
  
      return vtot;
  }
@@@ -1859,6 -1949,7 +1859,6 @@@ pdihs(int             nbonds
        rvec4           f[],
        rvec gmx_unused fshift[],
        const t_pbc*    pbc,
 -      const t_graph gmx_unused* g,
        real gmx_unused lambda,
        real gmx_unused* dvdlambda,
        const t_mdatoms gmx_unused* md,
@@@ -1973,6 -2064,7 +1973,6 @@@ rbdihs(int             nbonds
         rvec4           f[],
         rvec gmx_unused fshift[],
         const t_pbc*    pbc,
 -       const t_graph gmx_unused* g,
         real gmx_unused lambda,
         real gmx_unused* dvdlambda,
         const t_mdatoms gmx_unused* md,
@@@ -2099,6 -2191,7 +2099,6 @@@ real idihs(int             nbonds
             rvec4           f[],
             rvec            fshift[],
             const t_pbc*    pbc,
 -           const t_graph*  g,
             real            lambda,
             real*           dvdlambda,
             const t_mdatoms gmx_unused* md,
  
          dvdl_term += 0.5 * (kB - kA) * dp2 - kk * dphi0 * dp;
  
 -        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, g, x, t1,
 +        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, x, t1,
                             t2, t3); /* 112            */
          /* 218 TOTAL  */
      }
@@@ -2169,6 -2262,7 +2169,6 @@@ real low_angres(int             nbonds
                  rvec4           f[],
                  rvec            fshift[],
                  const t_pbc*    pbc,
 -                const t_graph*  g,
                  real            lambda,
                  real*           dvdlambda,
                  gmx_bool        bZAxis)
      rvec r_ij, r_kl, f_i, f_k = { 0, 0, 0 };
      real st, sth, nrij2, nrkl2, c, cij, ckl;
  
 -    ivec dt;
      t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
  
      vtot = 0.0;
  
              if (computeVirial(flavor))
              {
 -                if (g)
 -                {
 -                    ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
 -                    t1 = IVEC2IS(dt);
 -                }
                  rvec_inc(fshift[t1], f_i);
                  rvec_dec(fshift[CENTRAL], f_i);
                  if (!bZAxis)
                  {
 -                    if (g)
 -                    {
 -                        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt);
 -                        t2 = IVEC2IS(dt);
 -                    }
                      rvec_inc(fshift[t2], f_k);
                      rvec_dec(fshift[CENTRAL], f_k);
                  }
@@@ -2260,13 -2365,15 +2260,13 @@@ real angres(int             nbonds
              rvec4           f[],
              rvec            fshift[],
              const t_pbc*    pbc,
 -            const t_graph*  g,
              real            lambda,
              real*           dvdlambda,
              const t_mdatoms gmx_unused* md,
              t_fcdata gmx_unused* fcd,
              int gmx_unused* global_atom_index)
  {
 -    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g, lambda,
 -                              dvdlambda, FALSE);
 +    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, lambda, dvdlambda, FALSE);
  }
  
  template<BondedKernelFlavor flavor>
@@@ -2277,13 -2384,15 +2277,13 @@@ real angresz(int             nbonds
               rvec4           f[],
               rvec            fshift[],
               const t_pbc*    pbc,
 -             const t_graph*  g,
               real            lambda,
               real*           dvdlambda,
               const t_mdatoms gmx_unused* md,
               t_fcdata gmx_unused* fcd,
               int gmx_unused* global_atom_index)
  {
 -    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g, lambda,
 -                              dvdlambda, TRUE);
 +    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, lambda, dvdlambda, TRUE);
  }
  
  template<BondedKernelFlavor flavor>
@@@ -2294,6 -2403,7 +2294,6 @@@ real dihres(int             nbonds
              rvec4           f[],
              rvec            fshift[],
              const t_pbc*    pbc,
 -            const t_graph*  g,
              real            lambda,
              real*           dvdlambda,
              const t_mdatoms gmx_unused* md,
              {
                  *dvdlambda += kfac * ddp * ((dphiB - dphiA) - (phi0B - phi0A));
              }
 -            do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, g, x,
 -                               t1, t2, t3); /* 112            */
 +            do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, x, t1,
 +                               t2, t3); /* 112                */
          }
      }
      return vtot;
@@@ -2387,6 -2497,7 +2387,6 @@@ real unimplemented(int gmx_unused nbond
                     rvec4 gmx_unused f[],
                     rvec gmx_unused fshift[],
                     const t_pbc gmx_unused* pbc,
 -                   const t_graph gmx_unused* g,
                     real gmx_unused lambda,
                     real gmx_unused* dvdlambda,
                     const t_mdatoms gmx_unused* md,
@@@ -2404,6 -2515,7 +2404,6 @@@ real restrangles(int             nbonds
                   rvec4           f[],
                   rvec            fshift[],
                   const t_pbc*    pbc,
 -                 const t_graph*  g,
                   real gmx_unused lambda,
                   real gmx_unused* dvdlambda,
                   const t_mdatoms gmx_unused* md,
      int    i, d, ai, aj, ak, type, m;
      int    t1, t2;
      real   v, vtot;
 -    ivec   jt, dt_ij, dt_kj;
      rvec   f_i, f_j, f_k;
      double prefactor, ratio_ante, ratio_post;
      rvec   delta_ante, delta_post, vec_temp;
             real restrangles(int nbonds,
              const t_iatom forceatoms[],const t_iparams forceparams[],
              const rvec x[],rvec4 f[],rvec fshift[],
 -            const t_pbc *pbc,const t_graph *g,
 +            const t_pbc *pbc,
              real gmx_unused lambda,real gmx_unused *dvdlambda,
              const t_mdatoms gmx_unused *md,t_fcdata gmx_unused *fcd,
              int gmx_unused *global_atom_index)
             int t1, t2;
             rvec r_ij,r_kj;
             real v, vtot;
 -           ivec jt,dt_ij,dt_kj;
             rvec f_i, f_j, f_k;
             real prefactor, ratio_ante, ratio_post;
             rvec delta_ante, delta_post, vec_temp;
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -            }
 -
              rvec_inc(fshift[t1], f_i);
              rvec_inc(fshift[CENTRAL], f_j);
              rvec_inc(fshift[t2], f_k);
@@@ -2506,6 -2629,7 +2506,6 @@@ real restrdihs(int             nbonds
                 rvec4           f[],
                 rvec            fshift[],
                 const t_pbc*    pbc,
 -               const t_graph*  g,
                 real gmx_unused lambda,
                 real gmx_unused* dvlambda,
                 const t_mdatoms gmx_unused* md,
      int  i, d, type, ai, aj, ak, al;
      rvec f_i, f_j, f_k, f_l;
      rvec dx_jl;
 -    ivec jt, dt_ij, dt_kj, dt_lj;
      int  t1, t2, t3;
      real v, vtot;
      rvec delta_ante, delta_crnt, delta_post, vec_temp;
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -                t3 = IVEC2IS(dt_lj);
 -            }
 -            else if (pbc)
 +            if (pbc)
              {
                  t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
              }
@@@ -2614,6 -2749,7 +2614,6 @@@ real cbtdihs(int             nbonds
               rvec4           f[],
               rvec            fshift[],
               const t_pbc*    pbc,
 -             const t_graph*  g,
               real gmx_unused lambda,
               real gmx_unused* dvdlambda,
               const t_mdatoms gmx_unused* md,
      real v, vtot;
      rvec vec_temp;
      rvec f_i, f_j, f_k, f_l;
 -    ivec jt, dt_ij, dt_kj, dt_lj;
      rvec dx_jl;
      rvec delta_ante, delta_crnt, delta_post;
      rvec f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al;
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -                t3 = IVEC2IS(dt_lj);
 -            }
 -            else if (pbc)
 +            if (pbc)
              {
                  t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
              }
@@@ -2716,6 -2863,7 +2716,6 @@@ rbdihs(int             nbonds
         rvec4           f[],
         rvec            fshift[],
         const t_pbc*    pbc,
 -       const t_graph*  g,
         real            lambda,
         real*           dvdlambda,
         const t_mdatoms gmx_unused* md,
  
          ddphi = -ddphi * sin_phi; /*  11              */
  
 -        do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, g, x, t1,
 -                           t2, t3); /* 112            */
 +        do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, x, t1, t2,
 +                           t3); /* 112                */
          vtot += v;
      }
      *dvdlambda += dvdl_term;
@@@ -2859,14 -3007,15 +2859,14 @@@ int cmap_setup_grid_index(int ip, int g
  
  } // namespace
  
 -real cmap_dihs(int                   nbonds,
 -               const t_iatom         forceatoms[],
 -               const t_iparams       forceparams[],
 -               const gmx_cmap_t*     cmap_grid,
 -               const rvec            x[],
 -               rvec4                 f[],
 -               rvec                  fshift[],
 -               const struct t_pbc*   pbc,
 -               const struct t_graph* g,
 +real cmap_dihs(int                 nbonds,
 +               const t_iatom       forceatoms[],
 +               const t_iparams     forceparams[],
 +               const gmx_cmap_t*   cmap_grid,
 +               const rvec          x[],
 +               rvec4               f[],
 +               rvec                fshift[],
 +               const struct t_pbc* pbc,
                 real gmx_unused lambda,
                 real gmx_unused* dvdlambda,
                 const t_mdatoms gmx_unused* md,
      rvec a1, b1, a2, b2;
      rvec f1, g1, h1, f2, g2, h2;
      rvec dtf1, dtg1, dth1, dtf2, dtg2, dth2;
 -    ivec jt1, dt1_ij, dt1_kj, dt1_lj;
 -    ivec jt2, dt2_ij, dt2_kj, dt2_lj;
  
      int loop_index[4][4] = { { 0, 4, 8, 12 }, { 1, 5, 9, 13 }, { 2, 6, 10, 14 }, { 3, 7, 11, 15 } };
  
          /* Shift forces */
          if (fshift != nullptr)
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, a1j), jt1);
 -                ivec_sub(SHIFT_IVEC(g, a1i), jt1, dt1_ij);
 -                ivec_sub(SHIFT_IVEC(g, a1k), jt1, dt1_kj);
 -                ivec_sub(SHIFT_IVEC(g, a1l), jt1, dt1_lj);
 -                t11 = IVEC2IS(dt1_ij);
 -                t21 = IVEC2IS(dt1_kj);
 -                t31 = IVEC2IS(dt1_lj);
 -
 -                copy_ivec(SHIFT_IVEC(g, a2j), jt2);
 -                ivec_sub(SHIFT_IVEC(g, a2i), jt2, dt2_ij);
 -                ivec_sub(SHIFT_IVEC(g, a2k), jt2, dt2_kj);
 -                ivec_sub(SHIFT_IVEC(g, a2l), jt2, dt2_lj);
 -                t12 = IVEC2IS(dt2_ij);
 -                t22 = IVEC2IS(dt2_kj);
 -                t32 = IVEC2IS(dt2_lj);
 -            }
 -            else if (pbc)
 +            if (pbc)
              {
                  t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1);
                  t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2);
@@@ -3271,6 -3440,7 +3271,6 @@@ real g96bonds(int             nbonds
                rvec4           f[],
                rvec            fshift[],
                const t_pbc*    pbc,
 -              const t_graph*  g,
                real            lambda,
                real*           dvdlambda,
                const t_mdatoms gmx_unused* md,
  
          vtot += 0.5 * vbond; /* 1*/
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 44 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 44 TOTAL       */
      return vtot;
  }
  
@@@ -3323,6 -3493,7 +3323,6 @@@ real g96angles(int             nbonds
                 rvec4           f[],
                 rvec            fshift[],
                 const t_pbc*    pbc,
 -               const t_graph*  g,
                 real            lambda,
                 real*           dvdlambda,
                 const t_mdatoms gmx_unused* md,
      real cos_theta, dVdt, va, vtot;
      real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1;
      rvec f_i, f_j, f_k;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -            }
              rvec_inc(fshift[t1], f_i);
              rvec_inc(fshift[CENTRAL], f_j);
              rvec_inc(fshift[t2], f_k); /* 9 */
@@@ -3385,6 -3566,7 +3385,6 @@@ real cross_bond_bond(int             nb
                       rvec4           f[],
                       rvec            fshift[],
                       const t_pbc*    pbc,
 -                     const t_graph*  g,
                       real gmx_unused lambda,
                       real gmx_unused* dvdlambda,
                       const t_mdatoms gmx_unused* md,
      rvec r_ij, r_kj;
      real vtot, vrr, s1, s2, r1, r2, r1e, r2e, krr;
      rvec f_i, f_j, f_k;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -            }
              rvec_inc(fshift[t1], f_i);
              rvec_inc(fshift[CENTRAL], f_j);
              rvec_inc(fshift[t2], f_k); /* 9 */
@@@ -3457,6 -3649,7 +3457,6 @@@ real cross_bond_angle(int             n
                        rvec4           f[],
                        rvec            fshift[],
                        const t_pbc*    pbc,
 -                      const t_graph*  g,
                        real gmx_unused lambda,
                        real gmx_unused* dvdlambda,
                        const t_mdatoms gmx_unused* md,
      rvec r_ij, r_kj, r_ik;
      real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
      rvec f_i, f_j, f_k;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
  
          if (computeVirial(flavor))
          {
 -            if (g)
 -            {
 -                copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                t1 = IVEC2IS(dt_ij);
 -                t2 = IVEC2IS(dt_kj);
 -            }
              rvec_inc(fshift[t1], f_i);
              rvec_inc(fshift[CENTRAL], f_j);
              rvec_inc(fshift[t2], f_k); /* 9 */
@@@ -3588,6 -3791,7 +3588,6 @@@ real tab_bonds(int             nbonds
                 rvec4           f[],
                 rvec            fshift[],
                 const t_pbc*    pbc,
 -               const t_graph*  g,
                 real            lambda,
                 real*           dvdlambda,
                 const t_mdatoms gmx_unused* md,
          vtot += vbond;              /* 1*/
          fbond *= gmx::invsqrt(dr2); /*   6            */
  
 -        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
 -    }                                                                  /* 62 TOTAL    */
 +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, fshift); /* 15 */
 +    }                                                               /* 62 TOTAL       */
      return vtot;
  }
  
@@@ -3636,6 -3840,7 +3636,6 @@@ real tab_angles(int             nbonds
                  rvec4           f[],
                  rvec            fshift[],
                  const t_pbc*    pbc,
 -                const t_graph*  g,
                  real            lambda,
                  real*           dvdlambda,
                  const t_mdatoms gmx_unused* md,
      int  i, ai, aj, ak, t1, t2, type, table;
      rvec r_ij, r_kj;
      real cos_theta, cos_theta2, theta, dVdt, va, vtot;
 -    ivec jt, dt_ij, dt_kj;
  
      vtot = 0.0;
      for (i = 0; (i < nbonds);)
  
              if (computeVirial(flavor))
              {
 -                if (g)
 -                {
 -                    copy_ivec(SHIFT_IVEC(g, aj), jt);
 -
 -                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
 -                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
 -                    t1 = IVEC2IS(dt_ij);
 -                    t2 = IVEC2IS(dt_kj);
 -                }
                  rvec_inc(fshift[t1], f_i);
                  rvec_inc(fshift[CENTRAL], f_j);
                  rvec_inc(fshift[t2], f_k);
@@@ -3709,6 -3924,7 +3709,6 @@@ real tab_dihs(int             nbonds
                rvec4           f[],
                rvec            fshift[],
                const t_pbc*    pbc,
 -              const t_graph*  g,
                real            lambda,
                real*           dvdlambda,
                const t_mdatoms gmx_unused* md,
                                   forceparams[type].tab.kB, phi + M_PI, lambda, &vpd, &ddphi);
  
          vtot += vpd;
 -        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, g, x, t1,
 +        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, f, fshift, pbc, x, t1,
                             t2, t3); /* 112    */
  
      } /* 227 TOTAL  */
@@@ -3752,11 -3968,18 +3752,18 @@@ struct BondedInteraction
      int            nrnbIndex;
  };
  
+ // Bug in old clang versions prevents constexpr. constexpr is needed for MSVC.
+ #if defined(__clang__) && __clang_major__ < 6
+ #    define CONSTEXPR_EXCL_OLD_CLANG const
+ #else
+ #    define CONSTEXPR_EXCL_OLD_CLANG constexpr
+ #endif
  /*! \brief Lookup table of bonded interaction functions
   *
   * This must have as many entries as interaction_function in ifunc.cpp */
  template<BondedKernelFlavor flavor>
- const std::array<BondedInteractions, F_NRE> c_bondedInteractionFunctions = {
+ CONSTEXPR_EXCL_OLD_CLANG std::array<BondedInteractions, F_NRE> c_bondedInteractionFunctions = {
      BondedInteractions{ bonds<flavor>, eNR_BONDS },                       // F_BONDS
      BondedInteractions{ g96bonds<flavor>, eNR_BONDS },                    // F_G96BONDS
      BondedInteractions{ morse_bonds<flavor>, eNR_MORSE },                 // F_MORSE
  };
  
  /*! \brief List of instantiated BondedInteractions list */
- const gmx::EnumerationArray<BondedKernelFlavor, std::array<BondedInteractions, F_NRE>> c_bondedInteractionFunctionsPerFlavor = {
+ CONSTEXPR_EXCL_OLD_CLANG
+ gmx::EnumerationArray<BondedKernelFlavor, std::array<BondedInteractions, F_NRE>> c_bondedInteractionFunctionsPerFlavor = {
      c_bondedInteractionFunctions<BondedKernelFlavor::ForcesSimdWhenAvailable>,
      c_bondedInteractionFunctions<BondedKernelFlavor::ForcesNoSimd>,
      c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndVirialAndEnergy>,
  
  } // namespace
  
 -real calculateSimpleBond(const int            ftype,
 -                         const int            numForceatoms,
 -                         const t_iatom        forceatoms[],
 -                         const t_iparams      forceparams[],
 -                         const rvec           x[],
 -                         rvec4                f[],
 -                         rvec                 fshift[],
 -                         const struct t_pbc*  pbc,
 -                         const struct t_graph gmx_unused* g,
 -                         const real                       lambda,
 -                         real*                            dvdlambda,
 -                         const t_mdatoms*                 md,
 -                         t_fcdata*                        fcd,
 +real calculateSimpleBond(const int           ftype,
 +                         const int           numForceatoms,
 +                         const t_iatom       forceatoms[],
 +                         const t_iparams     forceparams[],
 +                         const rvec          x[],
 +                         rvec4               f[],
 +                         rvec                fshift[],
 +                         const struct t_pbc* pbc,
 +                         const real          lambda,
 +                         real*               dvdlambda,
 +                         const t_mdatoms*    md,
 +                         t_fcdata*           fcd,
                           int gmx_unused*          global_atom_index,
                           const BondedKernelFlavor bondedKernelFlavor)
  {
      const BondedInteractions& bonded = c_bondedInteractionFunctionsPerFlavor[bondedKernelFlavor][ftype];
  
 -    real v = bonded.function(numForceatoms, forceatoms, forceparams, x, f, fshift, pbc, g, lambda,
 +    real v = bonded.function(numForceatoms, forceatoms, forceparams, x, f, fshift, pbc, lambda,
                               dvdlambda, md, fcd, global_atom_index);
  
      return v;
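
The bonded-kernel selection above relies on a table of function pointers indexed first by kernel flavor and then by interaction type; the new CONSTEXPR_EXCL_OLD_CLANG macro only downgrades constexpr to const for old clang versions. A small self-contained sketch of that table-of-template-instantiations pattern, using hypothetical names rather than the real GROMACS tables:

#include <array>
#include <cstdio>

enum class Flavor : int { NoSimd = 0, Simd = 1 };

// Hypothetical kernels; the template parameter is a compile-time constant inside each body.
template<Flavor flavor>
static double bondKernel(double x)
{
    return (flavor == Flavor::Simd) ? 2.0 * x : x;
}

template<Flavor flavor>
static double angleKernel(double x)
{
    return x + 1.0;
}

using KernelPtr = double (*)(double);

// One lookup table per flavor, each with one entry per interaction type.
template<Flavor flavor>
constexpr std::array<KernelPtr, 2> kernelsForFlavor = { bondKernel<flavor>, angleKernel<flavor> };

// Outer table indexed by flavor, analogous to the per-flavor table above.
constexpr std::array<std::array<KernelPtr, 2>, 2> kernelTable = { kernelsForFlavor<Flavor::NoSimd>,
                                                                  kernelsForFlavor<Flavor::Simd> };

int main()
{
    // Select a kernel by (flavor, interaction type) at run time.
    const double v = kernelTable[static_cast<int>(Flavor::Simd)][0](1.5);
    std::printf("v = %g\n", v);
    return 0;
}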
index c8725816b3814318f76b905cd49b5e480971d2c4,c7aeca1a8b7b1379716bda636cc67bf04cfb697d..3cb7e40a93c6e173f20f7a9fd45c712063f55d92
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013-2020, by the GROMACS development team, led by
 + * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
  #include "gromacs/mdlib/forcerec_threading.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/mdlib/md_support.h"
 -#include "gromacs/mdlib/qmmm.h"
  #include "gromacs/mdlib/rf_util.h"
  #include "gromacs/mdlib/wall.h"
 +#include "gromacs/mdlib/wholemoleculetransform.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/fcdata.h"
 +#include "gromacs/mdtypes/forcerec.h"
  #include "gromacs/mdtypes/group.h"
  #include "gromacs/mdtypes/iforceprovider.h"
  #include "gromacs/mdtypes/inputrec.h"
 +#include "gromacs/mdtypes/interaction_const.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/strconvert.h"
  
 -/*! \brief environment variable to enable GPU P2P communication */
 -static const bool c_enableGpuPmePpComms =
 -        (getenv("GMX_GPU_PME_PP_COMMS") != nullptr) && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
 -
 -static real* mk_nbfp(const gmx_ffparams_t* idef, gmx_bool bBHAM)
 +static std::vector<real> mk_nbfp(const gmx_ffparams_t* idef, gmx_bool bBHAM)
  {
 -    real* nbfp;
 -    int   i, j, k, atnr;
 +    std::vector<real> nbfp;
 +    int               atnr;
  
      atnr = idef->atnr;
      if (bBHAM)
      {
 -        snew(nbfp, 3 * atnr * atnr);
 -        for (i = k = 0; (i < atnr); i++)
 +        nbfp.resize(3 * atnr * atnr);
 +        int k = 0;
 +        for (int i = 0; (i < atnr); i++)
          {
 -            for (j = 0; (j < atnr); j++, k++)
 +            for (int j = 0; (j < atnr); j++, k++)
              {
                  BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
                  BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
      }
      else
      {
 -        snew(nbfp, 2 * atnr * atnr);
 -        for (i = k = 0; (i < atnr); i++)
 +        nbfp.resize(2 * atnr * atnr);
 +        int k = 0;
 +        for (int i = 0; (i < atnr); i++)
          {
 -            for (j = 0; (j < atnr); j++, k++)
 +            for (int j = 0; (j < atnr); j++, k++)
              {
                  /* nbfp now includes the 6.0/12.0 derivative prefactors */
                  C6(nbfp, atnr, i, j)  = idef->iparams[k].lj.c6 * 6.0;
@@@ -186,10 -186,14 +186,10 @@@ enu
      acSETTLE
  };
  
 -static cginfo_mb_t* init_cginfo_mb(const gmx_mtop_t* mtop, const t_forcerec* fr, gmx_bool* bFEP_NonBonded)
 +static std::vector<cginfo_mb_t> init_cginfo_mb(const gmx_mtop_t* mtop, const t_forcerec* fr)
  {
 -    cginfo_mb_t* cginfo_mb;
 -    gmx_bool*    type_VDW;
 -    int*         cginfo;
 -    int*         a_con;
 -
 -    snew(cginfo_mb, mtop->molblock.size());
 +    gmx_bool* type_VDW;
 +    int*      a_con;
  
      snew(type_VDW, fr->ntype);
      for (int ai = 0; ai < fr->ntype; ai++)
          }
      }
  
 -    *bFEP_NonBonded = FALSE;
 -
 -    int a_offset = 0;
 +    std::vector<cginfo_mb_t> cginfoPerMolblock;
 +    int                      a_offset = 0;
      for (size_t mb = 0; mb < mtop->molblock.size(); mb++)
      {
          const gmx_molblock_t& molb = mtop->molblock[mb];
          const gmx_moltype_t&  molt = mtop->moltype[molb.type];
 -        const t_blocka&       excl = molt.excls;
 +        const auto&           excl = molt.excls;
  
          /* Check if the cginfo is identical for all molecules in this block.
           * If so, we only need an array of the size of one molecule.
              }
          }
  
 -        cginfo_mb[mb].cg_start = a_offset;
 -        cginfo_mb[mb].cg_end   = a_offset + molb.nmol * molt.atoms.nr;
 -        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb.nmol) * molt.atoms.nr;
 -        snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
 -        cginfo = cginfo_mb[mb].cginfo;
 +        cginfo_mb_t cginfo_mb;
 +        cginfo_mb.cg_start = a_offset;
 +        cginfo_mb.cg_end   = a_offset + molb.nmol * molt.atoms.nr;
 +        cginfo_mb.cg_mod   = (bId ? 1 : molb.nmol) * molt.atoms.nr;
 +        cginfo_mb.cginfo.resize(cginfo_mb.cg_mod);
 +        gmx::ArrayRef<int> cginfo = cginfo_mb.cginfo;
  
          /* Set constraints flags for constrained atoms */
          snew(a_con, molt.atoms.nr);
  
                  bool haveExclusions = false;
                  /* Loop over all the exclusions of atom ai */
 -                for (int j = excl.index[a]; j < excl.index[a + 1]; j++)
 +                for (const int j : excl[a])
                  {
 -                    if (excl.a[j] != a)
 +                    if (j != a)
                      {
                          haveExclusions = true;
                          break;
                  if (fr->efep != efepNO && PERTURBED(atom))
                  {
                      SET_CGINFO_FEP(atomInfo);
 -                    *bFEP_NonBonded = TRUE;
                  }
              }
          }
  
          sfree(a_con);
  
 +        cginfoPerMolblock.push_back(cginfo_mb);
 +
          a_offset += molb.nmol * molt.atoms.nr;
      }
      sfree(type_VDW);
  
 -    return cginfo_mb;
 +    return cginfoPerMolblock;
  }
  
 -static std::vector<int> cginfo_expand(const int nmb, const cginfo_mb_t* cgi_mb)
 +static std::vector<int> cginfo_expand(const int nmb, gmx::ArrayRef<const cginfo_mb_t> cgi_mb)
  {
      const int ncg = cgi_mb[nmb - 1].cg_end;
  
      return cginfo;
  }
  
 -static void done_cginfo_mb(cginfo_mb_t* cginfo_mb, int numMolBlocks)
 -{
 -    if (cginfo_mb == nullptr)
 -    {
 -        return;
 -    }
 -    for (int mb = 0; mb < numMolBlocks; ++mb)
 -    {
 -        sfree(cginfo_mb[mb].cginfo);
 -    }
 -    sfree(cginfo_mb);
 -}
 -
  /* Sets the sum of charges (squared) and C6 in the system in fr.
   * Returns whether the system has a net charge.
   */
@@@ -602,6 -618,11 +602,6 @@@ void forcerec_set_ranges(t_forcerec* fr
      fr->natoms_force        = natoms_force;
      fr->natoms_force_constr = natoms_force_constr;
  
 -    if (fr->natoms_force_constr > fr->nalloc_force)
 -    {
 -        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 -    }
 -
      if (fr->haveDirectVirialContributions)
      {
          fr->forceBufferForDirectVirialContributions.resize(natoms_f_novirsum);
@@@ -712,7 -733,6 +712,7 @@@ static void initVdwEwaldParameters(FILE
   * both accuracy requirements, when relevant.
   */
  static void init_ewald_f_table(const interaction_const_t& ic,
 +                               const real                 tableExtensionLength,
                                 EwaldCorrectionTables*     coulombTables,
                                 EwaldCorrectionTables*     vdwTables)
  {
       */
      const real tableScale = ewald_spline3_table_scale(ic, useCoulombTable, useVdwTable);
  
 -    const int tableSize = static_cast<int>(ic.rcoulomb * tableScale) + 2;
 +    real tableLen = ic.rcoulomb;
 +    if (useCoulombTable && tableExtensionLength > 0.0)
 +    {
 +        /* TODO: Ideally this should also check if couple-intramol == no, but that isn't
 +         * stored in ir. Grompp puts that info into an opts structure that doesn't make it into the tpr.
 +         * The alternative is to look through all the exclusions and check if they come from
 +         * couple-intramol == no. Meanwhile, always having larger tables should only affect
 +         * memory consumption, not speed (barring cache issues).
 +         */
 +        tableLen = ic.rcoulomb + tableExtensionLength;
 +    }
 +    const int tableSize = static_cast<int>(tableLen * tableScale) + 2;
  
      if (useCoulombTable)
      {
      }
  }
  
 -void init_interaction_const_tables(FILE* fp, interaction_const_t* ic)
 +void init_interaction_const_tables(FILE* fp, interaction_const_t* ic, const real tableExtensionLength)
  {
-     if (EEL_PME_EWALD(ic->eeltype))
+     if (EEL_PME_EWALD(ic->eeltype) || EVDW_PME(ic->vdwtype))
      {
-         init_ewald_f_table(*ic, tableExtensionLength, ic->coulombEwaldTables.get(), nullptr);
 -        init_ewald_f_table(*ic, ic->coulombEwaldTables.get(), ic->vdwEwaldTables.get());
++        init_ewald_f_table(*ic, tableExtensionLength, ic->coulombEwaldTables.get(),
++                           ic->vdwEwaldTables.get());
          if (fp != nullptr)
          {
-             fprintf(fp, "Initialized non-bonded Coulomb Ewald tables, spacing: %.2e size: %zu\n\n",
+             fprintf(fp, "Initialized non-bonded Ewald tables, spacing: %.2e size: %zu\n\n",
                      1 / ic->coulombEwaldTables->scale, ic->coulombEwaldTables->tableF.size());
          }
      }
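
The new tableExtensionLength argument simply enlarges the tabulated range before the point count is derived from the spline scale, so longer tables cost memory rather than speed. A toy calculation of the resulting table size, with made-up numbers and a hypothetical scale of 500 points per nm:

#include <cstdio>

int main()
{
    const double rcoulomb             = 1.0;   // nm, example cut-off
    const double tableExtensionLength = 1.0;   // nm, example extension
    const double tableScale           = 500.0; // points per nm, example spline scale

    // Mirrors the arithmetic above: extend the covered range, then derive the point count.
    const double tableLen  = rcoulomb + tableExtensionLength;
    const int    tableSize = static_cast<int>(tableLen * tableScale) + 2;
    std::printf("table covers %.2f nm with %d points\n", tableLen, tableSize);
    return 0;
}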
@@@ -817,6 -826,7 +818,7 @@@ static void init_interaction_const(FILE
      ic->cutoff_scheme = ir->cutoff_scheme;
  
      ic->coulombEwaldTables = std::make_unique<EwaldCorrectionTables>();
+     ic->vdwEwaldTables     = std::make_unique<EwaldCorrectionTables>();
  
      /* Lennard-Jones */
      ic->vdwtype         = ir->vdwtype;
      *interaction_const = ic;
  }
  
 -bool areMoleculesDistributedOverPbc(const t_inputrec& ir, const gmx_mtop_t& mtop, const gmx::MDLogger& mdlog)
 -{
 -    bool       areMoleculesDistributedOverPbc = false;
 -    const bool useEwaldSurfaceCorrection = (EEL_PME_EWALD(ir.coulombtype) && ir.epsilon_surface != 0);
 -
 -    const bool bSHAKE =
 -            (ir.eConstrAlg == econtSHAKE
 -             && (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0));
 -
 -    /* The group cut-off scheme and SHAKE assume charge groups
 -     * are whole, but not using molpbc is faster in most cases.
 -     * With intermolecular interactions we need PBC for calculating
 -     * distances between atoms in different molecules.
 -     */
 -    if (bSHAKE && !mtop.bIntermolecularInteractions)
 -    {
 -        areMoleculesDistributedOverPbc = ir.bPeriodicMols;
 -
 -        if (areMoleculesDistributedOverPbc)
 -        {
 -            gmx_fatal(FARGS, "SHAKE is not supported with periodic molecules");
 -        }
 -    }
 -    else
 -    {
 -        /* Not making molecules whole is faster in most cases,
 -         * but with orientation restraints or non-tinfoil boundary
 -         * conditions we need whole molecules.
 -         */
 -        areMoleculesDistributedOverPbc =
 -                (gmx_mtop_ftype_count(mtop, F_ORIRES) == 0 && !useEwaldSurfaceCorrection);
 -
 -        if (getenv("GMX_USE_GRAPH") != nullptr)
 -        {
 -            areMoleculesDistributedOverPbc = false;
 -
 -            GMX_LOG(mdlog.warning)
 -                    .asParagraph()
 -                    .appendText(
 -                            "GMX_USE_GRAPH is set, using the graph for bonded "
 -                            "interactions");
 -
 -            if (mtop.bIntermolecularInteractions)
 -            {
 -                GMX_LOG(mdlog.warning)
 -                        .asParagraph()
 -                        .appendText(
 -                                "WARNING: Molecules linked by intermolecular interactions "
 -                                "have to reside in the same periodic image, otherwise "
 -                                "artifacts will occur!");
 -            }
 -        }
 -
 -        GMX_RELEASE_ASSERT(areMoleculesDistributedOverPbc || !mtop.bIntermolecularInteractions,
 -                           "We need to use PBC within molecules with inter-molecular interactions");
 -
 -        if (bSHAKE && areMoleculesDistributedOverPbc)
 -        {
 -            gmx_fatal(FARGS,
 -                      "SHAKE is not properly supported with intermolecular interactions. "
 -                      "For short simulations where linked molecules remain in the same "
 -                      "periodic image, the environment variable GMX_USE_GRAPH can be used "
 -                      "to override this check.\n");
 -        }
 -    }
 -
 -    return areMoleculesDistributedOverPbc;
 -}
 -
  void init_forcerec(FILE*                            fp,
                     const gmx::MDLogger&             mdlog,
                     t_forcerec*                      fr,
                     const char*                      tabfn,
                     const char*                      tabpfn,
                     gmx::ArrayRef<const std::string> tabbfnm,
 -                   const gmx_hw_info_t&             hardwareInfo,
 -                   const gmx_device_info_t*         deviceInfo,
 -                   const bool                       useGpuForBonded,
 -                   const bool                       pmeOnlyRankUsesGpu,
 -                   real                             print_force,
 -                   gmx_wallcycle*                   wcycle)
 +                   real                             print_force)
  {
 -    real     rtab;
 -    char*    env;
 -    double   dbl;
 -    gmx_bool bFEP_NonBonded;
 +    /* The CMake default turns SIMD kernels on, but it might be turned off further down... */
 +    fr->use_simd_kernels = GMX_USE_SIMD_KERNELS;
  
 -    /* By default we turn SIMD kernels on, but it might be turned off further down... */
 -    fr->use_simd_kernels = TRUE;
 -
 -    if (check_box(ir->ePBC, box))
 +    if (check_box(ir->pbcType, box))
      {
 -        gmx_fatal(FARGS, "%s", check_box(ir->ePBC, box));
 +        gmx_fatal(FARGS, "%s", check_box(ir->pbcType, box));
      }
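    /* check_box() is expected to return a diagnostic string when the box is
     * invalid and nullptr otherwise, hence the second call inside gmx_fatal()
     * to format the error message. */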
  
      /* Test particle insertion ? */
      fr->sc_r_power    = ir->fepvals->sc_r_power;
      fr->sc_sigma6_def = gmx::power6(ir->fepvals->sc_sigma);
  
 -    env = getenv("GMX_SCSIGMA_MIN");
 +    char* env = getenv("GMX_SCSIGMA_MIN");
      if (env != nullptr)
      {
 -        dbl = 0;
 +        double dbl = 0;
          sscanf(env, "%20lf", &dbl);
          fr->sc_sigma6_min = gmx::power6(dbl);
          if (fp)
  
      /* Neighbour searching stuff */
      fr->cutoff_scheme = ir->cutoff_scheme;
 -    fr->ePBC          = ir->ePBC;
 +    fr->pbcType       = ir->pbcType;
  
      /* Determine if we will do PBC for distances in bonded interactions */
 -    if (fr->ePBC == epbcNONE)
 +    if (fr->pbcType == PbcType::No)
      {
          fr->bMolPBC = FALSE;
      }
      {
          const bool useEwaldSurfaceCorrection =
                  (EEL_PME_EWALD(ir->coulombtype) && ir->epsilon_surface != 0);
 +        const bool haveOrientationRestraints = (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0);
          if (!DOMAINDECOMP(cr))
          {
 -            fr->bMolPBC = areMoleculesDistributedOverPbc(*ir, *mtop, mdlog);
 -            // The assert below is equivalent to fcd->orires.nr != gmx_mtop_ftype_count(mtop, F_ORIRES)
 -            GMX_RELEASE_ASSERT(!fr->bMolPBC || fcd->orires.nr == 0,
 -                               "Molecules broken over PBC exist in a simulation including "
 -                               "orientation restraints. "
 -                               "This likely means that the global topology and the force constant "
 -                               "data have gotten out of sync.");
 -            if (useEwaldSurfaceCorrection)
 +            fr->bMolPBC = true;
 +
 +            if (useEwaldSurfaceCorrection || haveOrientationRestraints)
              {
 -                gmx_fatal(FARGS,
 -                          "In GROMACS 2020, Ewald dipole correction is disabled when not "
 -                          "using domain decomposition. With domain decomposition, it only works "
 -                          "when each molecule consists of a single update group (e.g. water). "
 -                          "This will be fixed in GROMACS 2021.");
 +                fr->wholeMoleculeTransform =
 +                        std::make_unique<gmx::WholeMoleculeTransform>(*mtop, ir->pbcType);
              }
          }
          else
          {
 -            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->pbcType);
  
 -            if (useEwaldSurfaceCorrection && !dd_moleculesAreAlwaysWhole(*cr->dd))
 +            /* With Ewald surface correction it is useful to support e.g. running water
 +             * in parallel with update groups.
 +             * With orientation restraints there is no sensible use case supported with DD.
 +             */
 +            if ((useEwaldSurfaceCorrection && !dd_moleculesAreAlwaysWhole(*cr->dd)) || haveOrientationRestraints)
              {
                  gmx_fatal(FARGS,
 -                          "You requested dipole correction (epsilon_surface > 0), but molecules "
 -                          "are broken "
 +                          "You requested Ewald surface correction or orientation restraints, "
 +                          "but molecules are broken "
                            "over periodic boundary conditions by the domain decomposition. "
                            "Run without domain decomposition instead.");
              }
  
          if (useEwaldSurfaceCorrection)
          {
 -            GMX_RELEASE_ASSERT((!DOMAINDECOMP(cr) && !fr->bMolPBC)
 -                                       || (DOMAINDECOMP(cr) && dd_moleculesAreAlwaysWhole(*cr->dd)),
 +            GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || dd_moleculesAreAlwaysWhole(*cr->dd),
                                 "Molecules can not be broken by PBC with epsilon_surface > 0");
          }
      }
  
      /* fr->ic is used both by verlet and group kernels (to some extent) now */
      init_interaction_const(fp, &fr->ic, ir, mtop, systemHasNetCharge);
 -    init_interaction_const_tables(fp, fr->ic);
 +    init_interaction_const_tables(fp, fr->ic, ir->tabext);
  
      const interaction_const_t* ic = fr->ic;
  
          case eelSWITCH:
          case eelSHIFT:
          case eelUSER:
 -        case eelENCADSHIFT:
          case eelPMESWITCH:
          case eelPMEUSER:
          case eelPMEUSERSWITCH:
  
          case evdwSWITCH:
          case evdwSHIFT:
 -        case evdwUSER:
 -        case evdwENCADSHIFT:
 -            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 -            break;
 +        case evdwUSER: fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE; break;
  
          default: gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[ic->vdwtype]);
      }
  
      fr->shiftForces.resize(SHIFTS);
  
 -    if (fr->nbfp == nullptr)
 +    if (fr->nbfp.empty())
      {
          fr->ntype = mtop->ffparams.atnr;
          fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
       * in that case grompp should already have checked that we do not need
       * normal tables and we only generate tables for 1-4 interactions.
       */
 -    rtab = ir->rlist + ir->tabext;
 +    real rtab = ir->rlist + ir->tabext;
  
      /* We want to use unmodified tables for 1-4 coulombic
       * interactions, so we must in general have an extra set of
      }
  
      // QM/MM initialization if requested
 -    fr->bQMMM = ir->bQMMM;
 -    if (fr->bQMMM)
 +    if (ir->bQMMM)
      {
 -        // Initialize QM/MM if supported
 -        if (GMX_QMMM)
 -        {
 -            GMX_LOG(mdlog.info)
 -                    .asParagraph()
 -                    .appendText(
 -                            "Large parts of the QM/MM support is deprecated, and may be removed in "
 -                            "a future "
 -                            "version. Please get in touch with the developers if you find the "
 -                            "support useful, "
 -                            "as help is needed if the functionality is to continue to be "
 -                            "available.");
 -            fr->qr = mk_QMMMrec();
 -            init_QMMMrec(cr, mtop, ir, fr);
 -        }
 -        else
 -        {
 -            gmx_incons(
 -                    "QM/MM was requested, but is only available when GROMACS "
 -                    "is configured with QM/MM support");
 -        }
 +        gmx_incons("QM/MM was requested, but is no longer available in GROMACS");
      }
  
      /* Set all the static charge group info */
 -    fr->cginfo_mb = init_cginfo_mb(mtop, fr, &bFEP_NonBonded);
 +    fr->cginfo_mb = init_cginfo_mb(mtop, fr);
      if (!DOMAINDECOMP(cr))
      {
          fr->cginfo = cginfo_expand(mtop->molblock.size(), fr->cginfo_mb);
      fr->nthread_ewc = gmx_omp_nthreads_get(emntBonded);
      snew(fr->ewc_t, fr->nthread_ewc);
  
 -    if (fr->cutoff_scheme == ecutsVERLET)
 -    {
 -        // We checked the cut-offs in grompp, but double-check here.
 -        // We have PME+LJcutoff kernels for rcoulomb>rvdw.
 -        if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
 -        {
 -            GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
 -                               "With Verlet lists and PME we should have rcoulomb>=rvdw");
 -        }
 -        else
 -        {
 -            GMX_RELEASE_ASSERT(
 -                    ir->rcoulomb == ir->rvdw,
 -                    "With Verlet lists and no PME rcoulomb and rvdw should be identical");
 -        }
 -
 -        fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr, cr, hardwareInfo, deviceInfo,
 -                                        mtop, box, wcycle);
 -
 -        if (useGpuForBonded)
 -        {
 -            auto stream = havePPDomainDecomposition(cr)
 -                                  ? Nbnxm::gpu_get_command_stream(
 -                                            fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
 -                                  : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
 -                                                                  gmx::InteractionLocality::Local);
 -            // TODO the heap allocation is only needed while
 -            // t_forcerec lacks a constructor.
 -            fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams, stream, wcycle);
 -        }
 -    }
 -
      if (ir->eDispCorr != edispcNO)
      {
          fr->dispersionCorrection = std::make_unique<DispersionCorrection>(
 -                *mtop, *ir, fr->bBHAM, fr->ntype,
 -                gmx::arrayRefFromArray(fr->nbfp, fr->ntype * fr->ntype * 2), *fr->ic, tabfn);
 +                *mtop, *ir, fr->bBHAM, fr->ntype, fr->nbfp, *fr->ic, tabfn);
          fr->dispersionCorrection->print(mdlog);
      }
  
           */
          fprintf(fp, "\n");
      }
 -
 -    if (pmeOnlyRankUsesGpu && c_enableGpuPmePpComms)
 -    {
 -        fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim, cr->dd->pme_nodeid);
 -    }
  }
  
  t_forcerec::t_forcerec() = default;
  
 -t_forcerec::~t_forcerec() = default;
 -
 -/* Frees GPU memory and sets a tMPI node barrier.
 - *
 - * Note that this function needs to be called even if GPUs are not used
 - * in this run because the PME ranks have no knowledge of whether GPUs
 - * are used or not, but all ranks need to enter the barrier below.
 - * \todo Remove physical node barrier from this function after making sure
 - * that it's not needed anymore (with a shared GPU run).
 - */
 -void free_gpu_resources(t_forcerec*                          fr,
 -                        const gmx::PhysicalNodeCommunicator& physicalNodeCommunicator,
 -                        const gmx_gpu_info_t&                gpu_info)
 -{
 -    bool isPPrankUsingGPU = (fr != nullptr) && (fr->nbv != nullptr) && fr->nbv->useGpu();
 -
 -    /* stop the GPU profiler (only CUDA) */
 -    if (gpu_info.n_dev > 0)
 -    {
 -        stopGpuProfiler();
 -    }
 -
 -    if (isPPrankUsingGPU)
 -    {
 -        /* Free data in GPU memory and pinned memory before destroying the GPU context */
 -        fr->nbv.reset();
 -
 -        delete fr->gpuBonded;
 -        fr->gpuBonded = nullptr;
 -    }
 -
 -    /* With tMPI we need to wait for all ranks to finish deallocation before
 -     * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
 -     * GPU and context.
 -     *
 -     * This is not a concern in OpenCL where we use one context per rank which
 -     * is freed in nbnxn_gpu_free().
 -     *
 -     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
 -     * but it is easier and more futureproof to call it on the whole node.
 -     */
 -    if (GMX_THREAD_MPI)
 -    {
 -        physicalNodeCommunicator.barrier();
 -    }
 -}
 -
 -void done_forcerec(t_forcerec* fr, int numMolBlocks)
 +t_forcerec::~t_forcerec()
  {
 -    if (fr == nullptr)
 -    {
 -        // PME-only ranks don't have a forcerec
 -        return;
 -    }
 -    done_cginfo_mb(fr->cginfo_mb, numMolBlocks);
 -    sfree(fr->nbfp);
 -    delete fr->ic;
 -    sfree(fr->shift_vec);
 -    sfree(fr->ewc_t);
 -    tear_down_bonded_threading(fr->bondedThreading);
 -    GMX_RELEASE_ASSERT(fr->gpuBonded == nullptr, "Should have been deleted earlier, when used");
 -    fr->bondedThreading = nullptr;
 -    delete fr;
 +    /* Note: This code will disappear when types are converted to C++ */
 +    sfree(shift_vec);
 +    sfree(ewc_t);
 +    tear_down_bonded_threading(bondedThreading);
  }
index 5ff92d9e19401a2ee8f26246cd52f18c96521275,f2528d78b43e643e34847d67e543ac819a8dcf02..179e3116e525860395178d1cd60ebde5df26bead
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013-2020, by the GROMACS development team, led by
 + * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -53,7 -53,6 +53,7 @@@
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
 +#include "gromacs/ewald/pme_pp.h"
  #include "gromacs/ewald/pme_pp_comm_gpu.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
  #include "gromacs/mdlib/constr.h"
  #include "gromacs/mdlib/enerdata_utils.h"
  #include "gromacs/mdlib/force.h"
 +#include "gromacs/mdlib/force_flags.h"
  #include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
 -#include "gromacs/mdlib/qmmm.h"
  #include "gromacs/mdlib/update.h"
 +#include "gromacs/mdlib/vsite.h"
 +#include "gromacs/mdlib/wholemoleculetransform.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/enerdata.h"
  #include "gromacs/mdtypes/forceoutput.h"
 +#include "gromacs/mdtypes/forcerec.h"
  #include "gromacs/mdtypes/iforceprovider.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
 +#include "gromacs/mdtypes/mdatom.h"
  #include "gromacs/mdtypes/simulation_workload.h"
  #include "gromacs/mdtypes/state.h"
  #include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
 +#include "gromacs/nbnxm/nbnxm_gpu.h"
  #include "gromacs/pbcutil/ishift.h"
 -#include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/pulling/pull_rotation.h"
@@@ -149,21 -144,22 +149,21 @@@ static void calc_virial(in
                          const rvec                       x[],
                          const gmx::ForceWithShiftForces& forceWithShiftForces,
                          tensor                           vir_part,
 -                        const t_graph*                   graph,
                          const matrix                     box,
                          t_nrnb*                          nrnb,
                          const t_forcerec*                fr,
 -                        int                              ePBC)
 +                        PbcType                          pbcType)
  {
      /* The short-range virial from surrounding boxes */
      const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
 -    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, ePBC == epbcSCREW, box);
 +    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
      inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
  
      /* Calculate partial virial, for local atoms only, based on short range.
       * Total virial is computed in global_stat, called from do_md
       */
      const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
 -    f_calc_vir(start, start + homenr, x, f, vir_part, graph, box);
 +    f_calc_vir(start, start + homenr, x, f, vir_part, box);
      inc_nrnb(nrnb, eNR_VIRIAL, homenr);
  
      if (debug)
@@@ -191,15 -187,11 +191,15 @@@ static void pull_potential_wrapper(cons
       * which is why pull_potential is called close to other communication.
       */
      wallcycle_start(wcycle, ewcPULLPOT);
 -    set_pbc(&pbc, ir->ePBC, box);
 +    set_pbc(&pbc, ir->pbcType, box);
      dvdl = 0;
      enerd->term[F_COM_PULL] += pull_potential(pull_work, mdatoms, &pbc, cr, t, lambda[efptRESTRAINT],
                                                as_rvec_array(x.data()), force, &dvdl);
      enerd->dvdl_lin[efptRESTRAINT] += dvdl;
 +    for (auto& dhdl : enerd->dhdlLambda)
 +    {
 +        dhdl += dvdl;
 +    }
      wallcycle_stop(wcycle, ewcPULLPOT);
  }
  
@@@ -230,11 -222,6 +230,11 @@@ static void pme_receive_force_ener(t_fo
      enerd->dvdl_lin[efptCOUL] += dvdl_q;
      enerd->dvdl_lin[efptVDW] += dvdl_lj;
  
 +    for (auto& dhdl : enerd->dhdlLambda)
 +    {
 +        dhdl += dvdl_q + dvdl_lj;
 +    }
 +
      if (wcycle)
      {
          dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
@@@ -286,6 -273,7 +286,6 @@@ static void post_process_forces(const t
                                  ForceOutputs*         forceOutputs,
                                  tensor                vir_force,
                                  const t_mdatoms*      mdatoms,
 -                                const t_graph*        graph,
                                  const t_forcerec*     fr,
                                  const gmx_vsite_t*    vsite,
                                  const StepWorkload&   stepWork)
               */
              matrix virial = { { 0 } };
              spread_vsite_f(vsite, x, fDirectVir, nullptr, stepWork.computeVirial, virial, nrnb,
 -                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
 +                           top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
              forceWithVirial.addVirialContribution(virial);
          }
  
@@@ -580,7 -568,7 +580,7 @@@ static void computeSpecialForces(FILE
          if (awh)
          {
              enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
 -                    inputrec->ePBC, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
 +                    inputrec->pbcType, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
          }
      }
  
      }
  }
  
 -/*! \brief Makes PME flags from StepWorkload data.
 - *
 - * \param[in]  stepWork     Step schedule flags
 - * \returns                 PME flags
 - */
 -static int makePmeFlags(const StepWorkload& stepWork)
 -{
 -    return GMX_PME_SPREAD | GMX_PME_SOLVE | (stepWork.computeVirial ? GMX_PME_CALC_ENER_VIR : 0)
 -           | (stepWork.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0)
 -           | (stepWork.computeForces ? GMX_PME_CALC_F : 0);
 -}
 -
  /*! \brief Launch the prepare_step and spread stages of PME GPU.
   *
   * \param[in]  pmedata              The PME structure
   * \param[in]  box                  The box matrix
   * \param[in]  stepWork             Step schedule flags
 - * \param[in]  pmeFlags             PME flags
   * \param[in]  xReadyOnDevice       Event synchronizer indicating that the coordinates are ready
   *                                  in the device memory.
   * \param[in]  wcycle               The wallcycle structure

   */
  static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                        const matrix          box,
                                        const StepWorkload&   stepWork,
 -                                      int                   pmeFlags,
                                        GpuEventSynchronizer* xReadyOnDevice,
                                        gmx_wallcycle_t       wcycle)
  {
 -    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags,
 -                                stepWork.useGpuPmeFReduction);
 +    pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
      pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
  }
  
   *
   * \param[in]  pmedata        The PME structure
   * \param[in]  wcycle         The wallcycle structure
 + * \param[in]  stepWork       Step schedule flags
   */
 -static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle)
 +static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork)
  {
 -    pme_gpu_launch_complex_transforms(pmedata, wcycle);
 -    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
 +    pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
 +    pme_gpu_launch_gather(pmedata, wcycle);
  }
  
  /*! \brief
   * \param[in,out] forceOutputs     Output buffer for the forces and virial
   * \param[in,out] enerd            Energy data structure results are reduced into
   * \param[in]     stepWork         Step schedule flags
 - * \param[in]     pmeFlags         PME flags
   * \param[in]     wcycle           The wallcycle structure
   */
  static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
                                          gmx::ForceOutputs*  forceOutputs,
                                          gmx_enerdata_t*     enerd,
                                          const StepWorkload& stepWork,
 -                                        int                 pmeFlags,
                                          gmx_wallcycle_t     wcycle)
  {
      bool isPmeGpuDone = false;
          {
              GpuTaskCompletion completionType =
                      (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
 -            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, &forceWithVirial,
 +            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
                                                     enerd, completionType);
          }
  
@@@ -766,13 -770,13 +766,13 @@@ static ForceOutputs setupForceOutputs(t
  
  /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
   */
 -static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
 -                                                          const t_forcerec&         fr,
 -                                                          const pull_t*             pull_work,
 -                                                          const gmx_edsam*          ed,
 -                                                          const t_idef&             idef,
 -                                                          const t_fcdata&           fcd,
 -                                                          const t_mdatoms&          mdatoms,
 +static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&             inputrec,
 +                                                          const t_forcerec&             fr,
 +                                                          const pull_t*                 pull_work,
 +                                                          const gmx_edsam*              ed,
 +                                                          const InteractionDefinitions& idef,
 +                                                          const t_fcdata&               fcd,
 +                                                          const t_mdatoms&              mdatoms,
                                                            const SimulationWorkload& simulationWork,
                                                            const StepWorkload&       stepWork)
  {
@@@ -839,7 -843,7 +839,7 @@@ static StepWorkload setupStepWorkload(c
  
  /* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
   *
 - * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
 + * TODO: eliminate \p useGpuPmeOnThisRank when this is
   * incorporated in DomainLifetimeWorkload.
   */
  static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                      gmx_pme_t*                        pmedata,
                                      gmx_enerdata_t*                   enerd,
                                      const gmx::MdrunScheduleWorkload& runScheduleWork,
 -                                    bool                              useGpuNonbonded,
 -                                    bool                              useGpuPme,
 +                                    bool                              useGpuPmeOnThisRank,
                                      int64_t                           step,
                                      gmx_wallcycle_t                   wcycle)
  {
 -    if (useGpuNonbonded)
 +    if (runScheduleWork.simulationWork.useGpuNonbonded)
      {
          /* Launch pruning before buffer clearing because the API overhead of the
           * clear kernel launches can leave the GPU idle while it could be running
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
      }
  
 -    if (useGpuPme)
 +    if (useGpuPmeOnThisRank)
      {
          pme_gpu_reinit_computation(pmedata, wcycle);
      }
      }
  }
  
 +//! \brief Data structure to hold dipole-related data and staging arrays
 +struct DipoleData
 +{
 +    //! Dipole staging for fast summing over MPI
 +    gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
 +    //! Dipole staging for states A and B (index 0 and 1 resp.)
 +    gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
 +};
 +
 +
 +static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
 +                                 const t_commrec*              cr,
 +                                 const bool                    haveFreeEnergy,
 +                                 gmx::ArrayRef<const real>     lambda,
 +                                 rvec                          muTotal,
 +                                 const DDBalanceRegionHandler& ddBalanceRegionHandler)
 +{
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
 +        ddBalanceRegionHandler.reopenRegionCpu();
 +    }
 +    for (int i = 0; i < 2; i++)
 +    {
 +        for (int j = 0; j < DIM; j++)
 +        {
 +            dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
 +        }
 +    }
 +
 +    if (!haveFreeEnergy)
 +    {
 +        copy_rvec(dipoleData->muStateAB[0], muTotal);
 +    }
 +    else
 +    {
 +        for (int j = 0; j < DIM; j++)
 +        {
 +            muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
 +                         + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
 +        }
 +    }
 +}
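/* A small numeric illustration of the reduction above (hypothetical values,
 * not part of this change): after gmx_sumd() the staged A/B dipoles might be
 *   muStateAB[0] = (2.0, 0.0, 0.0)   and   muStateAB[1] = (0.0, 4.0, 0.0);
 * with free energy active and lambda[efptCOUL] = 0.25 the returned total is
 *   muTotal = 0.75*(2.0, 0.0, 0.0) + 0.25*(0.0, 4.0, 0.0) = (1.5, 1.0, 0.0),
 * while without free energy muTotal is simply the state-A dipole.
 */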
  
  void do_force(FILE*                               fplog,
                const t_commrec*                    cr,
                gmx_enerdata_t*                     enerd,
                t_fcdata*                           fcd,
                gmx::ArrayRef<real>                 lambda,
 -              t_graph*                            graph,
                t_forcerec*                         fr,
                gmx::MdrunScheduleWorkload*         runScheduleWork,
                const gmx_vsite_t*                  vsite,
 -              rvec                                mu_tot,
 +              rvec                                muTotal,
                double                              t,
                gmx_edsam*                          ed,
                int                                 legacyFlags,
                const DDBalanceRegionHandler&       ddBalanceRegionHandler)
  {
 -    int                          i, j;
 -    double                       mu[2 * DIM];
 +    GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
 +               "The size of the force buffer should be at least the number of atoms to compute "
 +               "forces for");
 +
      nonbonded_verlet_t*          nbv      = fr->nbv.get();
      interaction_const_t*         ic       = fr->ic;
      gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
  
 -    // TODO remove the code below when the legacy flags are not in use anymore
 -    /* modify force flag if not doing nonbonded */
 -    if (!fr->bNonbonded)
 -    {
 -        legacyFlags &= ~GMX_FORCE_NONBONDED;
 -    }
 -
      const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
  
  
  
  
      const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
 -    const int  pmeFlags            = makePmeFlags(stepWork);
 -
 -    // Switches on whether to use GPU for position and force buffer operations
 -    // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
 -    const BufferOpsUseGpu useGpuXBufOps =
 -            stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 -    // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
 -    const BufferOpsUseGpu useGpuFBufOps =
 -            stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
  
      /* At a search step we need to start the first balancing region
       * somewhere early inside the step after communication during domain
          ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
      }
  
 -    const int start  = 0;
 -    const int homenr = mdatoms->homenr;
 -
      clear_mat(vir_force);
  
 -    if (stepWork.stateChanged)
 -    {
 -        if (inputrecNeedMutot(inputrec))
 -        {
 -            /* Calculate total (local) dipole moment in a temporary common array.
 -             * This makes it possible to sum them over nodes faster.
 -             */
 -            calc_mu(start, homenr, x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB,
 -                    mdatoms->nChargePerturbed, mu, mu + DIM);
 -        }
 -    }
 -
 -    if (fr->ePBC != epbcNONE)
 +    if (fr->pbcType != PbcType::No)
      {
          /* Compute shift vectors every step,
           * because of pressure coupling or box deformation!
          const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
          if (calcCGCM)
          {
 -            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr),
 +            put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
                                   gmx_omp_nthreads_get(emntDefault));
 -            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
 -        }
 -        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
 -        {
 -            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
 +            inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
          }
      }
  
      nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
  
 -#if GMX_MPI
      const bool pmeSendCoordinatesFromGpu =
 -            simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
 +            GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
      const bool reinitGpuPmePpComms =
 -            simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
 -#endif
 +            GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
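    /* Sketch of the pattern used for the two flags above (the names in this
     * aside are hypothetical): since GMX_MPI is a compile-time 0/1 constant
     * here, folding it into the runtime bool keeps the MPI-only path
     * compiling in non-MPI builds while the dead branch can still be dropped:
     *   const bool wantDirectPmePpSend = GMX_MPI && useGpuPmePpComm && !doSearch;
     *   if (wantDirectPmePpSend) { ... GPU-direct coordinate send ... }
     */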
  
 -    const auto localXReadyOnDevice = (stateGpu != nullptr)
 +    const auto localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
                                               ? stateGpu->getCoordinatesReadyOnDeviceEvent(
                                                         AtomLocality::Local, simulationWork, stepWork)
                                               : nullptr;
  
 -#if GMX_MPI
      // If coordinates are to be sent to PME task from CPU memory, perform that send here.
      // Otherwise the send will occur after H2D coordinate transfer.
 -    if (!thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
 +    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
      {
 -        /* Send particle coordinates to the pme nodes.
 -         * Since this is only implemented for domain decomposition
 -         * and domain decomposition does not use the graph,
 -         * we do not need to worry about shifting.
 -         */
 +        /* Send particle coordinates to the pme nodes */
          if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
          {
              GMX_RELEASE_ASSERT(false,
                                   step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                   pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
      }
 -#endif /* GMX_MPI */
  
      // Coordinates on the device are needed if PME or BufferOps are offloaded.
      // The local coordinates can be copied right away.
      // NOTE: Consider moving this copy to right after they are updated and constrained,
      //       if the later is not offloaded.
 -    if (useGpuPmeOnThisRank || useGpuXBufOps == BufferOpsUseGpu::True)
 +    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
      {
          if (stepWork.doNeighborSearch)
          {
 +            // TODO refactor this to do_md, after partitioning.
              stateGpu->reinit(mdatoms->homenr,
                               cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
              if (useGpuPmeOnThisRank)
      // The conditions for gpuHaloExchange e.g. using GPU buffer
      // operations were checked before construction, so here we can
      // just use it and assert upon any conditions.
 -    gmx::GpuHaloExchange* gpuHaloExchange =
 -            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
 -    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
 -    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
 +    const bool ddUsesGpuDirectCommunication =
 +            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
 +    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                 "Must use coordinate buffer ops with GPU halo exchange");
 -    const bool useGpuForcesHaloExchange =
 -            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
 +    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
  
      // Copy coordinate from the GPU if update is on the GPU and there
      // are forces to be computed on the CPU, or for the computation of
          haveCopiedXFromGpu = true;
      }
  
 -#if GMX_MPI
      // If coordinates are to be sent to PME task from GPU memory, perform that send here.
      // Otherwise the send will occur before the H2D coordinate transfer.
-     if (pmeSendCoordinatesFromGpu)
+     if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
      {
 -        /* Send particle coordinates to the pme nodes.
 -         * Since this is only implemented for domain decomposition
 -         * and domain decomposition does not use the graph,
 -         * we do not need to worry about shifting.
 -         */
 +        /* Send particle coordinates to the pme nodes */
          gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
                                   lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
                                   step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                   pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
      }
 -#endif /* GMX_MPI */
  
      if (useGpuPmeOnThisRank)
      {
 -        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, localXReadyOnDevice, wcycle);
 +        launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle);
      }
  
      /* do gridding for pair search */
      if (stepWork.doNeighborSearch)
      {
 -        if (graph && stepWork.stateChanged)
 +        if (fr->wholeMoleculeTransform && stepWork.stateChanged)
          {
 -            /* Calculate intramolecular shift vectors to make molecules whole */
 -            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
 +            fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
          }
  
          // TODO
              }
              wallcycle_stop(wcycle, ewcLAUNCH_GPU);
          }
 -    }
  
 -    if (stepWork.doNeighborSearch)
 -    {
          // Need to run after the GPU-offload bonded interaction lists
          // are set up to be able to determine whether there is bonded work.
          runScheduleWork->domainWork = setupDomainLifetimeWorkload(
          wallcycle_start_nocount(wcycle, ewcNS);
          wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
          /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
 -        nbv->constructPairlist(InteractionLocality::Local, &top->excls, step, nrnb);
 +        nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
  
          nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
  
          wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
          wallcycle_stop(wcycle, ewcNS);
  
 -        if (useGpuXBufOps == BufferOpsUseGpu::True)
 +        if (stepWork.useGpuXBufferOps)
          {
              nbv->atomdata_init_copy_x_to_nbat_x_gpu();
          }
          // For force buffer ops, we use the below condition rather than
 -        // useGpuFBufOps to ensure that init is performed even if this
 +        // useGpuFBufferOps to ensure that init is performed even if this
          // NS step is also a virial step (on which f buf ops are deactivated).
          if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
          {
      }
      else if (!EI_TPI(inputrec->eI))
      {
 -        if (useGpuXBufOps == BufferOpsUseGpu::True)
 +        if (stepWork.useGpuXBufferOps)
          {
              GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
              nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
  
          wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
 -        if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
 +        if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
          {
              Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
          }
          if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
          {
              wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
 -            fr->gpuBonded->launchKernel(fr, stepWork, box);
 +            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
          }
  
          // X copy/transform to allow overlap as well as after the GPU NB
          // launch to avoid FFT launch overhead hijacking the CPU and delaying
          // the nonbonded kernel.
 -        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
 +        launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork);
      }
  
      /* Communicate coordinates and sum dipole if necessary +
              wallcycle_start_nocount(wcycle, ewcNS);
              wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
              /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
 -            nbv->constructPairlist(InteractionLocality::NonLocal, &top->excls, step, nrnb);
 +            nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
  
              nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
              wallcycle_stop(wcycle, ewcNS);
 +            // TODO refactor this GPU halo exchange re-initialization
 +            // to location in do_md where GPU halo exchange is
 +            // constructed at partitioning, after above stateGpu
 +            // re-initialization has similarly been refactored
              if (ddUsesGpuDirectCommunication)
              {
 -                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
 +                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
              }
          }
          else
              {
                  // The following must be called after local setCoordinates (which records an event
                  // when the coordinate data has been copied to the device).
 -                gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
 +                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
  
                  if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                  {
                  dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
              }
  
 -            if (useGpuXBufOps == BufferOpsUseGpu::True)
 +            if (stepWork.useGpuXBufferOps)
              {
 -                // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
                  if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
                  {
                      stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
          {
              wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
 -            if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
 +            if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
              {
                  wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                  Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
              if (domainWork.haveGpuBondedWork)
              {
                  wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
 -                fr->gpuBonded->launchKernel(fr, stepWork, box);
 +                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
                  wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
              }
  
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
      }
  
 -    if (stepWork.stateChanged && inputrecNeedMutot(inputrec))
 +    gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
 +    if (fr->wholeMoleculeTransform)
      {
 -        if (PAR(cr))
 -        {
 -            gmx_sumd(2 * DIM, mu, cr);
 +        xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
 +    }
  
 -            ddBalanceRegionHandler.reopenRegionCpu();
 -        }
 +    DipoleData dipoleData;
  
 -        for (i = 0; i < 2; i++)
 -        {
 -            for (j = 0; j < DIM; j++)
 -            {
 -                fr->mu_tot[i][j] = mu[i * DIM + j];
 -            }
 -        }
 -    }
 -    if (mdatoms->nChargePerturbed == 0)
 -    {
 -        copy_rvec(fr->mu_tot[0], mu_tot);
 -    }
 -    else
 +    if (simulationWork.computeMuTot)
      {
 -        for (j = 0; j < DIM; j++)
 -        {
 -            mu_tot[j] = (1.0 - lambda[efptCOUL]) * fr->mu_tot[0][j] + lambda[efptCOUL] * fr->mu_tot[1][j];
 -        }
 +        const int start = 0;
 +
 +        /* Calculate total (local) dipole moment in a temporary common array.
 +         * This makes it possible to sum them over nodes faster.
 +         */
 +        gmx::ArrayRef<const gmx::RVec> xRef =
 +                (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
 +        calc_mu(start, mdatoms->homenr, xRef, mdatoms->chargeA, mdatoms->chargeB,
 +                mdatoms->nChargePerturbed, dipoleData.muStaging[0], dipoleData.muStaging[1]);
 +
 +        reduceAndUpdateMuTot(&dipoleData, cr, (fr->efep != efepNO), lambda, muTotal, ddBalanceRegionHandler);
      }
  
      /* Reset energies */
          }
      }
  
 -    /* update QMMMrec, if necessary */
 -    if (fr->bQMMM)
 -    {
 -        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
 -    }
 -
      // TODO Force flags should include haveFreeEnergyWork for this domain
      if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
      {
          stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
      }
      /* Compute the bonded and non-bonded energies and optionally forces */
 -    do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
 -                      fcd, box, lambda.data(), graph, fr->mu_tot, stepWork, ddBalanceRegionHandler);
 +    do_force_lowlevel(fr, inputrec, top->idef, cr, ms, nrnb, wcycle, mdatoms, x, xWholeMolecules,
 +                      hist, &forceOut, enerd, fcd, box, lambda.data(),
 +                      as_rvec_array(dipoleData.muStateAB), stepWork, ddBalanceRegionHandler);
  
      wallcycle_stop(wcycle, ewcFORCE);
  
                  wallcycle_stop(wcycle, ewcFORCE);
              }
  
 -            if (useGpuFBufOps == BufferOpsUseGpu::True)
 +            if (stepWork.useGpuFBufferOps)
              {
                  gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;
  
                      stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
                                                AtomLocality::NonLocal);
                      dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
 -                            AtomLocality::NonLocal, useGpuFBufOps == BufferOpsUseGpu::True));
 +                            AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
                  }
  
                  nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
                  {
                      stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                  }
 -                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
 +                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
              }
              else
              {
 -                if (useGpuFBufOps == BufferOpsUseGpu::True)
 +                if (stepWork.useGpuFBufferOps)
                  {
                      stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
                  }
      // With both nonbonded and PME offloaded a GPU on the same rank, we use
      // an alternating wait/reduction scheme.
      bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
 -                             && !DOMAINDECOMP(cr) && (useGpuFBufOps == BufferOpsUseGpu::False));
 +                             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
      if (alternateGpuWait)
      {
 -        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, pmeFlags, wcycle);
 +        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle);
      }
  
      if (!alternateGpuWait && useGpuPmeOnThisRank)
      {
 -        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd);
 +        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd);
      }
  
      /* Wait for local GPU NB outputs on the non-alternating wait path */
  
          gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
  
 -        if (useGpuFBufOps == BufferOpsUseGpu::True)
 +        if (stepWork.useGpuFBufferOps)
          {
              // Flag to specify whether the CPU force buffer has contributions to
              // local atoms. This depends on whether there are CPU-based force tasks
                  auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
  
                  stateGpu->copyForcesToGpu(forceWithShift, locality);
 -                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
 -                        locality, useGpuFBufOps == BufferOpsUseGpu::True));
 +                dependencyList.push_back(
 +                        stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
              }
              if (useGpuForcesHaloExchange)
              {
 -                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
 +                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
      }
  
      launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
 -                            simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
 +                            useGpuPmeOnThisRank, step, wcycle);
  
      if (DOMAINDECOMP(cr))
      {
          {
              rvec* fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data());
              spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE,
 -                           nullptr, nrnb, &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
 +                           nullptr, nrnb, top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
          }
  
          if (stepWork.computeVirial)
          {
              /* Calculation of the virial must be done after vsites! */
              calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()),
 -                        forceOut.forceWithShiftForces(), vir_force, graph, box, nrnb, fr, inputrec->ePBC);
 +                        forceOut.forceWithShiftForces(), vir_force, box, nrnb, fr, inputrec->pbcType);
          }
      }
  
      if (stepWork.computeForces)
      {
          post_process_forces(cr, step, nrnb, wcycle, top, box, as_rvec_array(x.unpaddedArrayRef().data()),
 -                            &forceOut, vir_force, mdatoms, graph, fr, vsite, stepWork);
 +                            &forceOut, vir_force, mdatoms, fr, vsite, stepWork);
      }
  
      if (stepWork.computeEnergy)
index 946399c92f27d73ea106090c587e97194ead1b87,e1ee7ccbcb94dc27c640203d003ff2a70e30bdff..8f7ee114832d4791962941c2da7c3289fbb0a9b8
@@@ -190,7 -190,7 +190,7 @@@ protected
      //! Contains command-line options to mdrun.
      const MdrunOptions& mdrunOptions;
      //! Whether the simulation will start afresh, or restart with/without appending.
-     StartingBehavior startingBehavior;
+     const StartingBehavior startingBehavior;
      //! Handles virtual sites.
      gmx_vsite_t* vsite;
      //! Handles constraints.
      //! The coordinate-swapping session.
      t_swap* swap;
      //! Full system topology.
 -    gmx_mtop_t* top_global;
 +    const gmx_mtop_t* top_global;
      //! Helper struct for force calculations.
      t_fcdata* fcd;
      //! Full simulation state (only non-nullptr on master rank).
diff --combined src/gromacs/mdrun/md.cpp
index 3971786b001ca7c082da7781b3fd5dda046bba86,efd18b09e3afcb6ea724eaaa96cbef3c3479dbb5..10b76df3778955f5a83038e18ce2edd367911be7
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_network.h"
  #include "gromacs/domdec/domdec_struct.h"
 +#include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/mdsetup.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
 -#include "gromacs/ewald/pme.h"
  #include "gromacs/ewald/pme_load_balancing.h"
 +#include "gromacs/ewald/pme_pp.h"
  #include "gromacs/fileio/trxio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
 +#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
  #include "gromacs/listed_forces/manage_threading.h"
@@@ -97,7 -95,7 +97,7 @@@
  #include "gromacs/mdlib/tgroup.h"
  #include "gromacs/mdlib/trajectory_writing.h"
  #include "gromacs/mdlib/update.h"
 -#include "gromacs/mdlib/update_constrain_cuda.h"
 +#include "gromacs/mdlib/update_constrain_gpu.h"
  #include "gromacs/mdlib/vcm.h"
  #include "gromacs/mdlib/vsite.h"
  #include "gromacs/mdrunutility/handlerestart.h"
  #include "gromacs/modularsimulator/energyelement.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
 -#include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/output.h"
  #include "gromacs/pulling/pull.h"
@@@ -175,8 -174,10 +175,8 @@@ void gmx::LegacySimulator::do_md(
      rvec                        mu_tot;
      matrix                      pressureCouplingMu, M;
      gmx_repl_ex_t               repl_ex = nullptr;
 -    gmx_localtop_t              top;
      PaddedHostVector<gmx::RVec> f{};
      gmx_global_stat_t           gstat;
 -    t_graph*                    graph = nullptr;
      gmx_shellfc_t*              shellfc;
      gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
      gmx_bool                    bTemp, bPres, bTrotter;
      int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
      bGStatEveryStep   = (nstglobalcomm == 1);
  
 -    SimulationGroups* groups = &top_global->groups;
 +    const SimulationGroups* groups = &top_global->groups;
  
      std::unique_ptr<EssentialDynamics> ed = nullptr;
      if (opt2bSet("-ei", nfile, fnm))
      t_state*                 state;
  
  
 +    gmx_localtop_t top(top_global->ffparams);
 +
      auto mdatoms = mdAtoms->mdatoms();
  
 -    std::unique_ptr<UpdateConstrainCuda> integrator;
 +    std::unique_ptr<UpdateConstrainGpu> integrator;
  
      if (DOMAINDECOMP(cr))
      {
 -        dd_init_local_top(*top_global, &top);
 -
          stateInstance = std::make_unique<t_state>();
          state         = stateInstance.get();
          dd_init_local_state(cr->dd, state_global, state);
      else
      {
          state_change_natoms(state_global, state_global->natoms);
 -        f.resizeWithPadding(state_global->natoms);
          /* Copy the pointer to the global state */
          state = state_global;
  
          /* Generate and initialize new topology */
 -        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &graph, mdAtoms, constr, vsite, shellfc);
 +        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
  
          upd.setNumAtoms(state->natoms);
      }
  
      StatePropagatorDataGpu* stateGpu = fr->stateGpu;
  
 +    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
      if (useGpuForUpdate)
      {
          GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
                             "Constraints pulling is not supported with the GPU update.\n");
          GMX_RELEASE_ASSERT(fcd->orires.nr == 0,
                             "Orientation restraints are not supported with the GPU update.\n");
 -        GMX_RELEASE_ASSERT(ir->efep == efepNO,
 -                           "Free energy perturbations are not supported with the GPU update.");
 -        GMX_RELEASE_ASSERT(graph == nullptr, "The graph is not supported with GPU update.");
 +        GMX_RELEASE_ASSERT(
 +                ir->efep == efepNO
 +                        || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 +                "Free energy perturbation of masses and constraints is not supported with the GPU "
 +                "update.");
  
          if (constr != nullptr && constr->numConstraintsTotal() > 0)
          {
          {
              GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
          }
 -        integrator = std::make_unique<UpdateConstrainCuda>(
 -                *ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
 -
 -        t_pbc pbc;
 -        set_pbc(&pbc, epbcXYZ, state->box);
 -        integrator->setPbc(&pbc);
 +        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 +                           "Device stream manager should be initialized in order to use GPU "
 +                           "update-constraints.");
 +        GMX_RELEASE_ASSERT(
 +                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 +                "Update stream should be initialized in order to use GPU "
 +                "update-constraints.");
 +        integrator = std::make_unique<UpdateConstrainGpu>(
 +                *ir, *top_global, fr->deviceStreamManager->context(),
 +                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 +                stateGpu->xUpdatedOnDevice());
 +
 +        integrator->setPbc(PbcType::Xyz, state->box);
      }
  
      if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
          {
              /* Construct the virtual sites for the initial configuration */
              construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, top.idef.iparams,
 -                             top.idef.il, fr->ePBC, fr->bMolPBC, cr, state->box);
 +                             top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
          }
      }
  
      bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
      if (PAR(cr))
      {
 -        gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
 +        gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
      }
      if (hasReadEkinState)
      {
              cglo_flags_iteration |= CGLO_STOPCM;
              cglo_flags_iteration &= ~CGLO_TEMPERATURE;
          }
 -        compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                        state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, nullptr, enerd,
 -                        force_vir, shake_vir, total_vir, pres, mu_tot, constr, &nullSignaller,
 -                        state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 +        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                        makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 +                        nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 +                        &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
                          cglo_flags_iteration
                                  | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
                                                                           : 0));
              /* At initialization, do not pass x with acceleration-correction mode
               * to avoid (incorrect) correction of the initial coordinates.
               */
 -            rvec* xPtr = nullptr;
 -            if (vcm.mode != ecmLINEAR_ACCELERATION_CORRECTION)
 -            {
 -                xPtr = state->x.rvec_array();
 -            }
 -            process_and_stopcm_grp(fplog, &vcm, *mdatoms, xPtr, state->v.rvec_array());
 +            auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 +                                                                     : makeArrayRef(state->x);
 +            process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
              inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
          }
      }
      checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 -                                    state->x.rvec_array(), state->box,
 +                                    makeConstArrayRef(state->x), state->box,
                                      &shouldCheckNumberOfBondedInteractions);
      if (ir->eI == eiVVAK)
      {
             kinetic energy calculation.  This minimizes excess variables, but
             perhaps loses some logic?*/
  
 -        compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                        state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, nullptr, enerd,
 -                        force_vir, shake_vir, total_vir, pres, mu_tot, constr, &nullSignaller,
 -                        state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 +        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                        makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 +                        nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 +                        &nullSignaller, state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
      }
  
      /* Calculate the initial half step temperature, and save the ekinh_old */
                                "The initial step is not consistent across multi simulations which "
                                "share the state");
                  }
 -                gmx_barrier(cr);
 +                gmx_barrier(cr->mpi_comm_mygroup);
              }
              else
              {
              bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
              bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
              bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
-                            && (step > 0) && (startingBehavior == StartingBehavior::NewSimulation));
+                            && (!bFirstStep));
          }
  
          bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
              /* Correct the new box if it is too skewed */
              if (inputrecDynamicBox(ir))
              {
 -                if (correct_box(fplog, step, state->box, graph))
 +                if (correct_box(fplog, step, state->box))
                  {
                      bMasterState = TRUE;
                      // If update is offloaded, it should be informed about the box size change
                      if (useGpuForUpdate)
                      {
 -                        t_pbc pbc;
 -                        set_pbc(&pbc, epbcXYZ, state->box);
 -                        integrator->setPbc(&pbc);
 +                        integrator->setPbc(PbcType::Xyz, state->box);
                      }
                  }
              }
                                      fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
                  shouldCheckNumberOfBondedInteractions = true;
                  upd.setNumAtoms(state->natoms);
 +
 +                // Allocate or re-size GPU halo exchange object, if necessary
 +                if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 +                    && useGpuForNonbonded && is1D(*cr->dd))
 +                {
 +                    GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 +                                       "GPU device manager has to be initialized to use GPU "
 +                                       "version of halo exchange.");
 +                    // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
 +                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
 +                }
              }
          }
  
          if (MASTER(cr) && do_log)
          {
 -            energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
 +            gmx::EnergyOutput::printHeader(fplog, step,
 +                                           t); /* can we improve the information printed here? */
          }
  
          if (ir->efep != efepNO)
              /* We need the kinetic energy at minus the half step for determining
               * the full step kinetic energy and possibly for T-coupling.*/
              /* This may not be quite working correctly yet . . . . */
 -            compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                            state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
 -                            nullptr, nullptr, nullptr, nullptr, mu_tot, constr, &nullSignaller,
 -                            state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 +            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                            makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 +                            nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr, nullptr, constr,
 +                            &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
                              CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
              checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 -                                            &top, state->x.rvec_array(), state->box,
 +                                            &top, makeConstArrayRef(state->x), state->box,
                                              &shouldCheckNumberOfBondedInteractions);
          }
          clear_mat(force_vir);
                                  imdSession, pull_work, bNS, force_flags, &top, constr, enerd, fcd,
                                  state->natoms, state->x.arrayRefWithPadding(),
                                  state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 -                                f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, graph,
 -                                shellfc, fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 +                                f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 +                                fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
          }
          else
          {
               */
              do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
                       nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 -                     f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, graph,
 -                     fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 +                     f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, fr,
 +                     runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
                       (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
          }
  
              if (bGStat || do_per_step(step - 1, nstglobalcomm))
              {
                  wallcycle_stop(wcycle, ewcUPDATE);
 -                compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                                state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
 -                                force_vir, shake_vir, total_vir, pres, mu_tot, constr, &nullSignaller,
 -                                state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 -                                (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
 -                                        | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
 -                                        | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
 -                                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 -                                                                                 : 0)
 -                                        | CGLO_SCALEEKIN);
 +                compute_globals(
 +                        gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                        makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 +                        nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
 +                        &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 +                        (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
 +                                | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
 +                                | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
 +                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 +                                                                         : 0)
 +                                | CGLO_SCALEEKIN);
                  /* explanation of above:
                     a) We compute Ekin at the full time step
                     if 1) we are using the AveVel Ekin, and it's not the
                     b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
                     EkinAveVel because it's needed for the pressure */
                  checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
 -                                                top_global, &top, state->x.rvec_array(), state->box,
 -                                                &shouldCheckNumberOfBondedInteractions);
 +                                                top_global, &top, makeConstArrayRef(state->x),
 +                                                state->box, &shouldCheckNumberOfBondedInteractions);
                  if (bStopCM)
                  {
 -                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(),
 -                                           state->v.rvec_array());
 +                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
 +                                           makeArrayRef(state->v));
                      inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
                  }
                  wallcycle_start(wcycle, ewcUPDATE);
                      /* We need the kinetic energy at minus the half step for determining
                       * the full step kinetic energy and possibly for T-coupling.*/
                      /* This may not be quite working correctly yet . . . . */
 -                    compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(),
 -                                    state->v.rvec_array(), state->box, state->lambda[efptVDW],
 +                    compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                                    makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
                                      mdatoms, nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr,
 -                                    nullptr, mu_tot, constr, &nullSignaller, state->box, nullptr,
 +                                    nullptr, constr, &nullSignaller, state->box, nullptr,
                                      &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
                      wallcycle_start(wcycle, ewcUPDATE);
                  }
          bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
  
          /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
-         if (startingBehavior != StartingBehavior::NewSimulation
+         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
              && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
          {
              copy_mat(state->svir_prev, shake_vir);
  
              update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state, cr, nrnb, wcycle, &upd,
                                    constr, do_log, do_ene);
 -            finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, constr);
 +            finish_update(ir, mdatoms, state, wcycle, &upd, constr);
          }
  
          if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
          {
              /* erase F_EKIN and F_TEMP here? */
              /* just compute the kinetic energy at the half step to perform a trotter step */
 -            compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                            state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
 -                            force_vir, shake_vir, total_vir, pres, mu_tot, constr, &nullSignaller, lastbox,
 -                            nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
 +            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                            makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
 +                            mdatoms, nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir,
 +                            pres, constr, &nullSignaller, lastbox, nullptr, &bSumEkinhOld,
 +                            (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
              wallcycle_start(wcycle, ewcUPDATE);
              trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
              /* now we know the scaling, we can compute the positions again */
               * to numerical errors, or are they important
               * physically? I'm thinking they are just errors, but not completely sure.
               * For now, will call without actually constraining, constr=NULL*/
 -            finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, nullptr);
 +            finish_update(ir, mdatoms, state, wcycle, &upd, nullptr);
          }
          if (EI_VV(ir->eI))
          {
             /* this factor of 2 correction is necessary
                 because half of the constraint force is removed
                 in the vv step, so we have to double it.  See
 -               the Redmine issue #1255.  It is not yet clear
 +               the Issue #1255.  It is not yet clear
                 if the factor of 2 is exact, or just a very
                 good approximation, and this will be
                 investigated.  The next step is to see if this
          if (vsite != nullptr)
          {
              wallcycle_start(wcycle, ewcVSITECONSTR);
 -            if (graph != nullptr)
 -            {
 -                shift_self(graph, state->box, state->x.rvec_array());
 -            }
              construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
 -                             top.idef.iparams, top.idef.il, fr->ePBC, fr->bMolPBC, cr, state->box);
 -
 -            if (graph != nullptr)
 -            {
 -                unshift_self(graph, state->box, state->x.rvec_array());
 -            }
 +                             top.idef.iparams, top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
              wallcycle_stop(wcycle, ewcVSITECONSTR);
          }
  
                  SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
  
                  compute_globals(
 -                        gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 -                        state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
 -                        force_vir, shake_vir, total_vir, pres, mu_tot, constr, &signaller, lastbox,
 -                        &totalNumberOfBondedInteractions, &bSumEkinhOld,
 +                        gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 +                        makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 +                        nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
 +                        &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
                          (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
                                  | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
                                  | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
                                  | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
                                                                           : 0));
                  checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
 -                                                top_global, &top, state->x.rvec_array(), state->box,
 -                                                &shouldCheckNumberOfBondedInteractions);
 +                                                top_global, &top, makeConstArrayRef(state->x),
 +                                                state->box, &shouldCheckNumberOfBondedInteractions);
                  if (!EI_VV(ir->eI) && bStopCM)
                  {
 -                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(),
 -                                           state->v.rvec_array());
 +                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
 +                                           makeArrayRef(state->v));
                      inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
  
                         // TODO: The special case of removing CM motion should be dealt with more gracefully
                          // force kernels that use the coordinates on the next steps is not implemented
                          // (not because of a race on state->x being modified on the CPU while H2D is in progress).
                          stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
+                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
+                         if (vcm.mode != ecmNO)
+                         {
+                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
+                         }
                      }
                  }
              }
          if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
          {
              integrator->scaleCoordinates(pressureCouplingMu);
 -            t_pbc pbc;
 -            set_pbc(&pbc, epbcXYZ, state->box);
 -            integrator->setPbc(&pbc);
 +            integrator->setPbc(PbcType::Xyz, state->box);
          }
  
          /* ################# END UPDATE STEP 2 ################# */
  
              if (doSimulatedAnnealing)
              {
 -                energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
 +                gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
 +                                                              &(ir->opts));
              }
              if (do_log || do_ene || do_dr || do_or)
              {
      {
          if (ir->nstcalcenergy > 0)
          {
 -            energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
 +            gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
              energyOutput.printAverages(fplog, groups);
          }
      }
index bc09f47c92a6e129702e5af842f3197cdf4a3acf,a034e7dd781ecefb99f7b987720608e4a65b2d9e..e9378fe8336e9277d1a04352b33bd7d4a5bedb58
@@@ -1,8 -1,7 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2012,2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -144,9 -143,8 +144,8 @@@ struct interaction_const_
  
      // Coulomb Ewald correction table
      std::unique_ptr<EwaldCorrectionTables> coulombEwaldTables;
-     /* Note that a Van der Waals Ewald correction table
-      * of type EwaldCorrectionTables can be added here if wanted.
-      */
+     // Van der Waals Ewald correction table
+     std::unique_ptr<EwaldCorrectionTables> vdwEwaldTables;
  };
  
  #endif
diff --combined src/gromacs/nbnxm/grid.h
index b55f83d22256cc2546d016f00ba355bf15ad5f95,1e68c14dae2d16a9b0f4fb8a75a89773bc1af4be..806ee91f2bdb8826801f7d4816e6081197a7ee53
@@@ -125,20 -125,14 +125,20 @@@ struct BoundingBo
          //! Returns a pointer for SIMD storing of a Corner object
          float* ptr() { return &x; }
  
 -        float x;       //!< x coordinate
 -        float y;       //!< y coordinate
 -        float z;       //!< z coordinate
 -        float padding; //!< padding, unused, but should be set to avoid operations on unitialized data
 +        //! x coordinate
 +        float x;
 +        //! y coordinate
 +        float y;
 +        //! z coordinate
 +        float z;
 +        //! padding, unused, but should be set to avoid operations on uninitialized data
 +        float padding;
      };
  
 -    Corner lower; //!< lower, along x and y and z, corner
 -    Corner upper; //!< upper, along x and y and z, corner
 +    //! lower, along x and y and z, corner
 +    Corner lower;
 +    //! upper, along x and y and z, corner
 +    Corner upper;
  };
  
  /*! \internal
   */
  struct BoundingBox1D
  {
 -    float lower; //!< lower bound
 -    float upper; //!< upper bound
 +    //! lower bound
 +    float lower;
 +    //! upper bound
 +    float upper;
  };
  
  } // namespace Nbnxm
@@@ -190,19 -182,14 +190,19 @@@ public
          //! Constructs the cluster/cell geometry given the type of pairlist
          Geometry(PairlistType pairlistType);
  
 -        bool isSimple;             //!< Is this grid simple (CPU) or hierarchical (GPU)
 -        int  numAtomsICluster;     //!< Number of atoms per cluster
 -        int  numAtomsJCluster;     //!< Number of atoms for list j-clusters
 -        int  numAtomsPerCell;      //!< Number of atoms per cell
 -        int  numAtomsICluster2Log; //!< 2log of na_c
 +        //! Is this grid simple (CPU) or hierarchical (GPU)
 +        bool isSimple;
 +        //! Number of atoms per cluster
 +        int numAtomsICluster;
 +        //! Number of atoms for list j-clusters
 +        int numAtomsJCluster;
 +        //! Number of atoms per cell
 +        int numAtomsPerCell;
 +        //! 2log of na_c
 +        int numAtomsICluster2Log;
      };
  
 -    // The physical dimensions of a grid
 +    //! The physical dimensions of a grid \internal
      struct Dimensions
      {
          //! The lower corner of the (local) grid
      //! Returns the number of cells in the column
      int numCellsInColumn(int columnIndex) const
      {
-         return cxy_ind_[columnIndex + 1] - cxy_ind_[columnIndex];
+         return cxy_ind_[columnIndex + 1LL] - cxy_ind_[columnIndex];
      }
  
      //! Returns the index of the first atom in the column
index ea90c12b0c7e713c16777d492c0a0409a554a0e3,26a9bba0da94ffef9ec382aec0e1dd9f8b3581b3..e7994d84bac46560035ffc9b5e3d746a0dbd0cdc
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/forcerec.h"
  #include "gromacs/mdtypes/inputrec.h"
 +#include "gromacs/mdtypes/interaction_const.h"
 +#include "gromacs/nbnxm/atomdata.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/pairlist_tuning.h"
  #include "gromacs/simd/simd.h"
 +#include "gromacs/topology/mtop_util.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/logger.h"
  
 -#include "atomdata.h"
 -#include "gpu_types.h"
  #include "grid.h"
  #include "nbnxm_geometry.h"
  #include "nbnxm_simd.h"
@@@ -186,6 -185,7 +186,7 @@@ static KernelSetup pick_nbnxn_kernel_cp
           * On AMD Zen, tabulated Ewald kernels are faster on all 4 combinations
           * of single or double precision and 128 or 256-bit AVX2.
           */
+         MSVC_DIAGNOSTIC_IGNORE(6285) // Always zero because compile time constant
          if (
  #if GMX_SIMD
                  (GMX_SIMD_REAL_WIDTH >= 8 || (GMX_SIMD_REAL_WIDTH >= 4 && GMX_SIMD_HAVE_FMA && !GMX_DOUBLE)) &&
          {
              kernelSetup.ewaldExclusionType = EwaldExclusionType::Analytical;
          }
-         else
-         {
-             kernelSetup.ewaldExclusionType = EwaldExclusionType::Table;
-         }
+         MSVC_DIAGNOSTIC_RESET
+         else { kernelSetup.ewaldExclusionType = EwaldExclusionType::Table; }
          if (getenv("GMX_NBNXN_EWALD_TABLE") != nullptr)
          {
              kernelSetup.ewaldExclusionType = EwaldExclusionType::Table;
@@@ -322,7 -320,7 +321,7 @@@ namespace Nbnx
  {
  
  /*! \brief Gets and returns the minimum i-list count for balancing based on the GPU used or env.var. when set */
 -static int getMinimumIlistCountForGpuBalancing(gmx_nbnxn_gpu_t* nbnxmGpu)
 +static int getMinimumIlistCountForGpuBalancing(NbnxmGpu* nbnxmGpu)
  {
      int minimumIlistCount;
  
      return minimumIlistCount;
  }
  
 -std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
 -                                                   gmx_bool                 bFEP_NonBonded,
 -                                                   const t_inputrec*        ir,
 -                                                   const t_forcerec*        fr,
 -                                                   const t_commrec*         cr,
 -                                                   const gmx_hw_info_t&     hardwareInfo,
 -                                                   const gmx_device_info_t* deviceInfo,
 -                                                   const gmx_mtop_t*        mtop,
 -                                                   matrix                   box,
 -                                                   gmx_wallcycle*           wcycle)
 +std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
 +                                                   const t_inputrec*    ir,
 +                                                   const t_forcerec*    fr,
 +                                                   const t_commrec*     cr,
 +                                                   const gmx_hw_info_t& hardwareInfo,
 +                                                   const bool           useGpuForNonbonded,
 +                                                   const gmx::DeviceStreamManager* deviceStreamManager,
 +                                                   const gmx_mtop_t*               mtop,
 +                                                   matrix                          box,
 +                                                   gmx_wallcycle*                  wcycle)
  {
      const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
 -    const bool useGpu     = deviceInfo != nullptr;
  
 -    GMX_RELEASE_ASSERT(!(emulateGpu && useGpu),
 +    GMX_RELEASE_ASSERT(!(emulateGpu && useGpuForNonbonded),
                         "When GPU emulation is active, there cannot be a GPU assignment");
  
      NonbondedResource nonbondedResource;
 -    if (useGpu)
 +    if (useGpuForNonbonded)
      {
          nonbondedResource = NonbondedResource::Gpu;
      }
      Nbnxm::KernelSetup kernelSetup = pick_nbnxn_kernel(mdlog, fr->use_simd_kernels, hardwareInfo,
                                                         nonbondedResource, ir, fr->bNonbonded);
  
 -    const bool haveMultipleDomains = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
 +    const bool haveMultipleDomains = havePPDomainDecomposition(cr);
  
 -    PairlistParams pairlistParams(kernelSetup.kernelType, bFEP_NonBonded, ir->rlist,
 -                                  havePPDomainDecomposition(cr));
 +    bool           bFEP_NonBonded = (fr->efep != efepNO) && haveFepPerturbedNBInteractions(mtop);
 +    PairlistParams pairlistParams(kernelSetup.kernelType, bFEP_NonBonded, ir->rlist, haveMultipleDomains);
  
      setupDynamicPairlistPruning(mdlog, ir, mtop, box, fr->ic, &pairlistParams);
  
          enbnxninitcombrule = enbnxninitcombruleNONE;
      }
  
 -    auto pinPolicy = (useGpu ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
 +    auto pinPolicy = (useGpuForNonbonded ? gmx::PinningPolicy::PinnedIfSupported
 +                                         : gmx::PinningPolicy::CannotBePinned);
  
      auto nbat = std::make_unique<nbnxn_atomdata_t>(pinPolicy);
  
      }
      nbnxn_atomdata_init(mdlog, nbat.get(), kernelSetup.kernelType, enbnxninitcombrule, fr->ntype,
                          fr->nbfp, mimimumNumEnergyGroupNonbonded,
 -                        (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
 +                        (useGpuForNonbonded || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
  
 -    gmx_nbnxn_gpu_t* gpu_nbv                          = nullptr;
 -    int              minimumIlistCountForGpuBalancing = 0;
 -    if (useGpu)
 +    NbnxmGpu* gpu_nbv                          = nullptr;
 +    int       minimumIlistCountForGpuBalancing = 0;
 +    if (useGpuForNonbonded)
      {
          /* init the NxN GPU data; the last argument tells whether we'll have
           * both local and non-local NB calculation on GPU */
 -        gpu_nbv = gpu_init(deviceInfo, fr->ic, pairlistParams, nbat.get(), cr->nodeid, haveMultipleDomains);
 +        GMX_RELEASE_ASSERT(
 +                (deviceStreamManager != nullptr),
 +                "Device stream manager should be initialized in order to use GPU for non-bonded.");
 +        gpu_nbv = gpu_init(*deviceStreamManager, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
  
          minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
      }
                                                         minimumIlistCountForGpuBalancing);
  
      auto pairSearch = std::make_unique<PairSearch>(
 -            ir->ePBC, EI_TPI(ir->eI), DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr,
 +            ir->pbcType, EI_TPI(ir->eI), DOMAINDECOMP(cr) ? &cr->dd->numCells : nullptr,
              DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr, pairlistParams.pairlistType,
              bFEP_NonBonded, gmx_omp_nthreads_get(emntPairsearch), pinPolicy);
  
@@@ -474,7 -469,7 +473,7 @@@ nonbonded_verlet_t::nonbonded_verlet_t(
                                         std::unique_ptr<PairSearch>       pairSearch,
                                         std::unique_ptr<nbnxn_atomdata_t> nbat_in,
                                         const Nbnxm::KernelSetup&         kernelSetup,
 -                                       gmx_nbnxn_gpu_t*                  gpu_nbv_ptr,
 +                                       NbnxmGpu*                         gpu_nbv_ptr,
                                         gmx_wallcycle*                    wcycle) :
      pairlistSets_(std::move(pairlistSets)),
      pairSearch_(std::move(pairSearch)),
index 97805f7f1b5a615ab051922d41a9851858b35d16,162954c89d0fc87871208fb7e09c548d14ac5ab1..04f33159905bdc48ef3400306e01ba907f826352
@@@ -1,8 -1,8 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2012,2013,2014,2015, The GROMACS development team.
 - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 + * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
  #include "gromacs/utility/enumerationhelpers.h"
  #include "gromacs/utility/real.h"
  
 -// This file with constants is separate from this file to be able
 -// to include it during OpenCL jitting without including config.h
 -#include "constants.h"
  #include "pairlistparams.h"
  
  struct NbnxnPairlistCpuWork;
  struct NbnxnPairlistGpuWork;
  
  
 -/* Convenience type for vector with aligned memory */
 +//! Convenience type for vector with aligned memory
  template<typename T>
  using AlignedVector = std::vector<T, gmx::AlignedAllocator<T>>;
  
 -/* Convenience type for vector that avoids initialization at resize() */
 +//! Convenience type for vector that avoids initialization at resize()
  template<typename T>
  using FastVector = std::vector<T, gmx::DefaultInitializationAllocator<T>>;
  
 -/* A buffer data structure of 64 bytes
 +/*! \brief Cache-line protection buffer
 + *
 + * A buffer data structure of 64 bytes
   * to be placed at the beginning and end of structs
   * to avoid cache invalidation of the real contents
   * of the struct by writes to neighboring memory.
   */
  typedef struct
  {
 +    //! Unused field used to create space to protect cache lines that are in use
      int dummy[16];
  } gmx_cache_protect_t;
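// A minimal illustrative sketch (struct and field names are hypothetical) of how this
// protection buffer is meant to be used: bracket frequently-written data with cp0/cp1,
// as the pairlist structs below do, so writes to neighboring memory by other threads
// cannot invalidate the cache lines holding the payload.
struct CacheProtectedPayloadSketch
{
    gmx_cache_protect_t cp0;          // protects the cache lines at the start
    int                 payload[256]; // the data that is actually written frequently
    gmx_cache_protect_t cp1;          // protects the cache lines at the end
};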
  
 -/* This is the actual cluster-pair list j-entry.
 +/*! \brief This is the actual cluster-pair list j-entry.
 + *
   * cj is the j-cluster.
   * The interaction bits in excl are indexed i-major, j-minor.
   * The cj entries are sorted such that ones with exclusions come first.
   */
  struct nbnxn_cj_t
  {
 -    int          cj;   /* The j-cluster                    */
 -    unsigned int excl; /* The exclusion (interaction) bits */
 +    //! The j-cluster
 +    int cj;
 +    //! The exclusion (interaction) bits
 +    unsigned int excl;
  };
  
 -/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
 +/*! \brief Constants for interpreting interaction flags
 + *
 + * In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
   * The upper bits contain information for non-bonded kernel optimization.
   * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
   * But three flags can be used to skip interactions, currently only for subc=0
   * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
   * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
   */
 +//! \{
  #define NBNXN_CI_SHIFT 127
  #define NBNXN_CI_DO_LJ(subc) (1 << (7 + 3 * (subc)))
  #define NBNXN_CI_HALF_LJ(subc) (1 << (8 + 3 * (subc)))
  #define NBNXN_CI_DO_COUL(subc) (1 << (9 + 3 * (subc)))
 +//! \}
  
 -/* Cluster-pair Interaction masks
 +/*! \brief Cluster-pair Interaction masks
 + *
   * Bit i*j-cluster-size + j tells if atom i and j interact.
   */
 +//! \{
  // TODO: Rename according to convention when moving into Nbnxn namespace
 -/* All interaction mask is the same for all kernels */
 +//! All interaction mask is the same for all kernels
  constexpr unsigned int NBNXN_INTERACTION_MASK_ALL = 0xffffffffU;
 -/* 4x4 kernel diagonal mask */
 +//! 4x4 kernel diagonal mask
  constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG = 0x08ceU;
 -/* 4x2 kernel diagonal masks */
 +//! 4x2 kernel diagonal masks
 +//! \{
  constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_0 = 0x0002U;
  constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_1 = 0x002fU;
 -/* 4x8 kernel diagonal masks */
 +//! \}
 +//! 4x8 kernel diagonal masks
 +//! \{
  constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_0 = 0xf0f8fcfeU;
  constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_1 = 0x0080c0e0U;
 +//! \}
 +//! \}
 +
 +/*! \brief Lower limit for square interaction distances in nonbonded kernels.
 + *
 + * For smaller values we will overflow when calculating r^-1 or r^-12, but
 + * to keep it simple we always apply the limit from the tougher r^-12 condition.
 + */
 +#if GMX_DOUBLE
 +// Some double precision SIMD architectures use single precision in the first
 +// step, so although the double precision criterion would allow smaller rsq,
 +// we need to stay in single precision with some margin for the N-R iterations.
 +constexpr double c_nbnxnMinDistanceSquared = 1.0e-36;
 +#else
 +// The worst intermediate value we might evaluate is r^-12, which
 +// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin)
 +constexpr float c_nbnxnMinDistanceSquared = 3.82e-07F; // r > 6.2e-4
 +#endif
  
 -/* Simple pair-list i-unit */
 +
 +//! The number of clusters in a super-cluster, used for GPU
 +constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
 +
 +/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage
 + * of integers containing 32 bits.
 + */
 +constexpr int c_nbnxnGpuJgroupSize = (32 / c_nbnxnGpuNumClusterPerSupercluster);
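// Illustrative numeric check of the two constants above, assuming GMX_FLOAT_MAX is
// essentially FLT_MAX (~3.4e38): pow(FLT_MAX, -1.0/6.0) is about 3.78e-7, so with the
// ~1% margin from the comment the limit becomes ~3.82e-7, i.e. r stays above ~6.2e-4;
// likewise 32 / c_nbnxnGpuNumClusterPerSupercluster = 32 / 8 = 4 cluster pairs per group.
#include <cfloat>
#include <cmath>
static inline float nbnxnMinDistanceSquaredSketch()
{
    // Reproduces the single-precision bound quoted for c_nbnxnMinDistanceSquared
    return std::pow(FLT_MAX, -1.0F / 6.0F) * 1.01F; // ~3.82e-7
}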
 +
 +/*! \internal
 + * \brief Simple pair-list i-unit
 + */
  struct nbnxn_ci_t
  {
 -    int ci;           /* i-cluster             */
 -    int shift;        /* Shift vector index plus possible flags, see above */
 -    int cj_ind_start; /* Start index into cj   */
 -    int cj_ind_end;   /* End index into cj     */
 +    //! i-cluster
 +    int ci;
 +    //! Shift vector index plus possible flags, see above
 +    int shift;
 +    //! Start index into cj
 +    int cj_ind_start;
 +    //! End index into cj
 +    int cj_ind_end;
  };
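// A minimal sketch (function and variable names are illustrative only, not GROMACS API)
// of how the shift field of an nbnxn_ci_t entry combines the shift-vector index with the
// NBNXN_CI_* optimization flags documented above:
static inline void decodeCiEntrySketch(const nbnxn_ci_t& ciEntry)
{
    const int  shiftVectorIndex = ciEntry.shift & NBNXN_CI_SHIFT;             // lower 7 bits
    const bool doLJ             = (ciEntry.shift & NBNXN_CI_DO_LJ(0)) != 0;   // LJ needed for sub-cluster 0
    const bool halfLJ           = (ciEntry.shift & NBNXN_CI_HALF_LJ(0)) != 0; // LJ only for the first half of i
    const bool doCoulomb        = (ciEntry.shift & NBNXN_CI_DO_COUL(0)) != 0; // Coulomb needed for all pairs
    (void)shiftVectorIndex;
    (void)doLJ;
    (void)halfLJ;
    (void)doCoulomb;
}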
  
 -/* Grouped pair-list i-unit */
 +//! Grouped pair-list i-unit
  typedef struct
  {
 -    /* Returns the number of j-cluster groups in this entry */
 +    //! Returns the number of j-cluster groups in this entry
      int numJClusterGroups() const { return cj4_ind_end - cj4_ind_start; }
  
 -    int sci;           /* i-super-cluster       */
 -    int shift;         /* Shift vector index plus possible flags */
 -    int cj4_ind_start; /* Start index into cj4  */
 -    int cj4_ind_end;   /* End index into cj4    */
 +    //! i-super-cluster
 +    int sci;
 +    //! Shift vector index plus possible flags
 +    int shift;
 +    //! Start index into cj4
 +    int cj4_ind_start;
 +    //! End index into cj4
 +    int cj4_ind_end;
  } nbnxn_sci_t;
  
 -/* Interaction data for a j-group for one warp */
 +//! Interaction data for a j-group for one warp
  struct nbnxn_im_ei_t
  {
 -    // The i-cluster interactions mask for 1 warp
 +    //! The i-cluster interactions mask for 1 warp
      unsigned int imask = 0U;
 -    // Index into the exclusion array for 1 warp, default index 0 which means no exclusions
 +    //! Index into the exclusion array for 1 warp, default index 0 which means no exclusions
      int excl_ind = 0;
  };
  
 +//! Four-way j-cluster lists
  typedef struct
  {
 -    int           cj[c_nbnxnGpuJgroupSize];         /* The 4 j-clusters */
 -    nbnxn_im_ei_t imei[c_nbnxnGpuClusterpairSplit]; /* The i-cluster mask data       for 2 warps */
 +    //! The 4 j-clusters
 +    int cj[c_nbnxnGpuJgroupSize];
 +    //! The i-cluster mask data for 2 warps
 +    nbnxn_im_ei_t imei[c_nbnxnGpuClusterpairSplit];
  } nbnxn_cj4_t;
  
 -/* Struct for storing the atom-pair interaction bits for a cluster pair in a GPU pairlist */
 +//! Struct for storing the atom-pair interaction bits for a cluster pair in a GPU pairlist
  struct nbnxn_excl_t
  {
 -    /* Constructor, sets no exclusions, so all atom pairs interacting */
 +    //! Constructor, sets no exclusions, so all atom pairs interacting
+     MSVC_DIAGNOSTIC_IGNORE(26495) // pair is not being initialized!
      nbnxn_excl_t()
      {
          for (unsigned int& pairEntry : pair)
              pairEntry = NBNXN_INTERACTION_MASK_ALL;
          }
      }
+     MSVC_DIAGNOSTIC_RESET
  
 -    /* Topology exclusion interaction bits per warp */
 +    //! Topology exclusion interaction bits per warp
      unsigned int pair[c_nbnxnGpuExclSize];
  };
  
 -/* Cluster pairlist type for use on CPUs */
 +//! Cluster pairlist type for use on CPUs
  struct NbnxnPairlistCpu
  {
      NbnxnPairlistCpu();
  
 +    //! Cache protection
      gmx_cache_protect_t cp0;
  
 -    int                    na_ci;   /* The number of atoms per i-cluster        */
 -    int                    na_cj;   /* The number of atoms per j-cluster        */
 -    real                   rlist;   /* The radius for constructing the list     */
 -    FastVector<nbnxn_ci_t> ci;      /* The i-cluster list                       */
 -    FastVector<nbnxn_ci_t> ciOuter; /* The outer, unpruned i-cluster list       */
 -
 -    FastVector<nbnxn_cj_t> cj;      /* The j-cluster list, size ncj             */
 -    FastVector<nbnxn_cj_t> cjOuter; /* The outer, unpruned j-cluster list       */
 -    int                    ncjInUse; /* The number of j-clusters that are used by ci entries in this list, will be <= cj.size() */
 -
 -    int nci_tot; /* The total number of i clusters           */
 +    //! The number of atoms per i-cluster
 +    int na_ci;
 +    //! The number of atoms per j-cluster
 +    int na_cj;
 +    //! The radius for constructing the list
 +    real rlist;
 +    //! The i-cluster list
 +    FastVector<nbnxn_ci_t> ci;
 +    //! The outer, unpruned i-cluster list
 +    FastVector<nbnxn_ci_t> ciOuter;
 +
 +    //! The j-cluster list, size ncj
 +    FastVector<nbnxn_cj_t> cj;
 +    //! The outer, unpruned j-cluster list
 +    FastVector<nbnxn_cj_t> cjOuter;
 +    //! The number of j-clusters that are used by ci entries in this list, will be <= cj.size()
 +    int ncjInUse;
 +
 +    //! The total number of i clusters
 +    int nci_tot;
  
 -    /* Working data storage for list construction */
 +    //! Working data storage for list construction
      std::unique_ptr<NbnxnPairlistCpuWork> work;
  
 +    //! Cache protection
      gmx_cache_protect_t cp1;
  };
  
   */
  struct NbnxnPairlistGpu
  {
 -    /* Constructor
 +    /*! \brief Constructor
       *
       * \param[in] pinningPolicy  Sets the pinning policy for all buffers used on the GPU
       */
      NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy);
  
 +    //! Cache protection
      gmx_cache_protect_t cp0;
  
 -    int  na_ci; /* The number of atoms per i-cluster        */
 -    int  na_cj; /* The number of atoms per j-cluster        */
 -    int  na_sc; /* The number of atoms per super cluster    */
 -    real rlist; /* The radius for constructing the list     */
 +    //! The number of atoms per i-cluster
 +    int na_ci;
 +    //! The number of atoms per j-cluster
 +    int na_cj;
 +    //! The number of atoms per super cluster
 +    int na_sc;
 +    //! The radius for constructing the list
 +    real rlist;
      // The i-super-cluster list, indexes into cj4;
      gmx::HostVector<nbnxn_sci_t> sci;
      // The list of 4*j-cluster groups
      // The total number of i-clusters
      int nci_tot;
  
 -    /* Working data storage for list construction */
 +    //! Working data storage for list construction
      std::unique_ptr<NbnxnPairlistGpuWork> work;
  
 +    //! Cache protection
      gmx_cache_protect_t cp1;
  };
  
index 482ba62f59832bdf05de75d363c8756668c30c72,6c5f24ad876909e908fbfcbabab1bc89e00b50d4..0ec97a877cd36ced66847a4d3f98229c446c88bd
@@@ -554,13 -554,13 +554,13 @@@ void OptionStorageTemplate<T>::commitVa
      store_->reserve(setValues_.size());
      // For bool the loop variable isn't a reference (it's its special reference type)
      // clang-format off
 -    CLANG_DIAGNOSTIC_IGNORE(-Wrange-loop-analysis)
 +    CLANG_DIAGNOSTIC_IGNORE(-Wrange-loop-analysis);
      // clang-format on
      for (const auto& value : setValues_)
      {
          store_->append(value);
      }
-     DIAGNOSTIC_RESET;
+     CLANG_DIAGNOSTIC_RESET;
      clearSet();
  }
  
index 78b34060afbcdd20b27c665d58b4ae826f8f0ca3,8adb34434aa141dfc603f0622f32ccedf15b23bb..349092cc80bd74581d20293ae4747a12f21ffc07
@@@ -1,7 -1,7 +1,7 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2010,2012,2013,2014,2015, The GROMACS development team.
 +# Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team.
  # Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
@@@ -54,6 -54,7 +54,7 @@@ else(
      gmx_target_warning_suppression(scanner -Wno-unused HAS_NO_UNUSED)
      gmx_target_warning_suppression(scanner -Wno-unused-parameter HAS_NO_UNUSED_PARAMETER)
      gmx_target_warning_suppression(scanner -Wno-missing-declarations HAS_NO_MISSING_DECLARATIONS)
+     gmx_target_warning_suppression(scanner -Wno-null-conversion HAS_NO_NULL_CONVERSIONS)
      gmx_target_warning_suppression(scanner -wd1419 HAS_DECL_IN_SOURCE)
  endif()
  list(APPEND libgromacs_object_library_dependencies scanner)
index d6feeba60bb20e32d3233a34ea025b272ee37e11,1ca12e817114c248be9ca6923ceae1b7fed63bb8..e0984d0d05d4fc2ee0d2506ee2db05fa6cceb1a1
@@@ -1,8 -1,7 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 + * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -56,8 -55,9 +56,9 @@@ namespace gm
  class SimdDouble
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdDouble() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdDouble(double d) : simdInternal_(_mm256_set1_pd(d)) {}
  
      // Internal utility constructor to simplify return statements
@@@ -69,8 -69,9 +70,9 @@@
  class SimdDInt32
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdDInt32() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdDInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}
  
      // Internal utility constructor to simplify return statements
@@@ -82,8 -83,9 +84,9 @@@
  class SimdDBool
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdDBool() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdDBool(bool b) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32(b ? 0xFFFFFFFF : 0))) {}
  
      // Internal utility constructor to simplify return statements
@@@ -95,8 -97,9 +98,9 @@@
  class SimdDIBool
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdDIBool() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdDIBool(bool b) : simdInternal_(_mm_set1_epi32(b ? 0xFFFFFFFF : 0)) {}
  
      // Internal utility constructor to simplify return statements
index 76c42d2331c0e3df7f3b638efc85dcc7c73dcae1,36496ca3ec5294395942fb58749fe7086ca3a697..7f219d9937fc967e07ab5e4f45ed4fda6f6cad92
@@@ -1,8 -1,7 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 + * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -53,8 -52,9 +53,9 @@@ namespace gm
  class SimdFloat
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdFloat() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdFloat(float f) : simdInternal_(_mm256_set1_ps(f)) {}
  
      // Internal utility constructor to simplify return statements
@@@ -66,8 -66,9 +67,9 @@@
  class SimdFInt32
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdFInt32() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdFInt32(std::int32_t i) : simdInternal_(_mm256_set1_epi32(i)) {}
  
      // Internal utility constructor to simplify return statements
@@@ -79,8 -80,9 +81,9 @@@
  class SimdFBool
  {
  public:
+     MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
      SimdFBool() {}
+     MSVC_DIAGNOSTIC_RESET
      SimdFBool(bool b) : simdInternal_(_mm256_castsi256_ps(_mm256_set1_epi32(b ? 0xFFFFFFFF : 0))) {}
  
      // Internal utility constructor to simplify return statements
diff --combined src/gromacs/simd/simd.h
index ffc5a6cce53defd72424d3cdba1c6bd20aa8f741,bd9b678e1f8cbd60ed503bdd5ffaa0741232b592..537046645a9b2080ae4024bfc6dcadb9e7e794c4
@@@ -1,8 -1,7 +1,8 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -79,6 -78,7 +79,7 @@@
  #include <array>
  #include <type_traits>
  
+ #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/classhelpers.h"
  #include "gromacs/utility/real.h"
  
index fe0a30a56a15661f993bb13d56e711c90b85925a,db9065a2b2ad221bc8cd5ba04f6bf36a2e1265d1..10fee47a24b40f31275487be8f752c85de506af9
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
-  * Copyright (c) 2010,2014,2015,2018,2019, by the GROMACS development team, led by
+  * Copyright (c) 2010,2014,2015,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -48,9 -48,6 +48,9 @@@
  namespace gmx
  {
  
 +template<typename>
 +class ListOfLists;
 +
  /*! \brief Division of a range of indices into consecutive blocks
   *
   * A range of consecutive indices 0 to full.range.end() is divided
@@@ -67,7 -64,10 +67,10 @@@ public
      int numBlocks() const { return static_cast<int>(index_.size()) - 1; }
  
      /*! \brief Returns the size of the block with index \p blockIndex */
-     Block block(int blockIndex) const { return Block(index_[blockIndex], index_[blockIndex + 1]); }
+     Block block(int blockIndex) const
+     {
+         return Block(index_[blockIndex], index_[blockIndex + 1LL]);
+     }
  
      /*! \brief Returns the full range */
      Block fullRange() const { return Block(index_.front(), index_.back()); }
      void reduceNumBlocks(int newNumBlocks)
      {
          GMX_ASSERT(newNumBlocks <= numBlocks(), "Can only shrink to fewer blocks");
-         index_.resize(newNumBlocks + 1);
+         index_.resize(newNumBlocks + 1LL);
      }
  
      /*! \brief Sets the partitioning to \p numBlocks blocks each of size 1 */
@@@ -220,6 -220,5 +223,6 @@@ void stupid_fill_blocka(t_blocka* grp, 
  
  void pr_block(FILE* fp, int indent, const char* title, const t_block* block, gmx_bool bShowNumbers);
  void pr_blocka(FILE* fp, int indent, const char* title, const t_blocka* block, gmx_bool bShowNumbers);
 +void pr_listoflists(FILE* fp, int indent, const char* title, const gmx::ListOfLists<int>* block, gmx_bool bShowNumbers);
  
  #endif
index aa7b5f5dbc8dd2e912f85031ce7f302dfff0e652,d1ae0c6ce71f26f60b9398f866daf8008be71ff3..a6a83112cac309de11cc7b6f78658b3e18180248
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -152,12 -151,22 +152,22 @@@ index ssize(const T& t
  #    define DO_PRAGMA(x) _Pragma(#    x)
  #    define CLANG_DIAGNOSTIC_IGNORE(warning) \
          _Pragma("clang diagnostic push") DO_PRAGMA(clang diagnostic ignored #warning)
- #    define DIAGNOSTIC_RESET _Pragma("clang diagnostic pop")
+ #    define CLANG_DIAGNOSTIC_RESET _Pragma("clang diagnostic pop")
  #else
- //! Ignore specified clang warning until DIAGNOSTIC_RESET
+ //! Ignore specified clang warning until CLANG_DIAGNOSTIC_RESET
  #    define CLANG_DIAGNOSTIC_IGNORE(warning)
  //! Reset all diagnostics to default
- #    define DIAGNOSTIC_RESET
+ #    define CLANG_DIAGNOSTIC_RESET
+ #endif
+ #ifdef _MSC_VER
+ #    define MSVC_DIAGNOSTIC_IGNORE(id) __pragma(warning(push)) __pragma(warning(disable : id))
+ #    define MSVC_DIAGNOSTIC_RESET __pragma(warning(pop))
+ #else
+ //! Ignore specified MSVC warning until MSVC_DIAGNOSTIC_RESET
+ #    define MSVC_DIAGNOSTIC_IGNORE(warning)
+ //! Reset all diagnostics to default
+ #    define MSVC_DIAGNOSTIC_RESET
  #endif
  
  namespace gmx
index e081ed4b5e2d76baa2ff3777d767eb4ee8e399a7,1a96d71ce18ece1f7b3691c9f4eb039580978a3f..a81e582b02fedaa80c1ca53ba36cc013feab6065
@@@ -3,8 -3,7 +3,8 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
 - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@@ -296,9 -295,11 +296,11 @@@ void gmx_print_version_info(gmx::TextWr
       * them. Can wait for later, as the master branch has ready code to do all
       * that. */
      writer->writeLine(formatString("C compiler:         %s", BUILD_C_COMPILER));
-     writer->writeLine(formatString("C compiler flags:   %s", BUILD_CFLAGS));
+     writer->writeLine(formatString("C compiler flags:   %s %s", BUILD_CFLAGS,
+                                    CMAKE_BUILD_CONFIGURATION_C_FLAGS));
      writer->writeLine(formatString("C++ compiler:       %s", BUILD_CXX_COMPILER));
-     writer->writeLine(formatString("C++ compiler flags: %s", BUILD_CXXFLAGS));
+     writer->writeLine(formatString("C++ compiler flags: %s %s", BUILD_CXXFLAGS,
+                                    CMAKE_BUILD_CONFIGURATION_CXX_FLAGS));
  #ifdef HAVE_LIBMKL
      /* MKL might be used for LAPACK/BLAS even if FFTs use FFTW, so keep it separate */
      writer->writeLine(formatString("Linked with Intel MKL version %d.%d.%d.", __INTEL_MKL__,
  #endif
  #if GMX_GPU == GMX_GPU_CUDA
      writer->writeLine(formatString("CUDA compiler:      %s", CUDA_COMPILER_INFO));
-     writer->writeLine(formatString("CUDA compiler flags:%s", CUDA_COMPILER_FLAGS));
+     writer->writeLine(formatString("CUDA compiler flags:%s %s", CUDA_COMPILER_FLAGS,
+                                    CMAKE_BUILD_CONFIGURATION_CXX_FLAGS));
      writer->writeLine("CUDA driver:        " + gmx::getCudaDriverVersionString());
      writer->writeLine("CUDA runtime:       " + gmx::getCudaRuntimeVersionString());
  #endif
index d0f77c29137d6a05336fc04c9a529614778b100f,bb8e8b777482a822563c4c991c0a2e7a87f72107..6ac9aacbe4b3fd66d4194c9e64a9cdf82e4aa526
@@@ -1,7 -1,7 +1,7 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2011,2012,2013,2014,2015, The GROMACS development team.
 +# Copyright (c) 2011,2012,2013,2014,2015 by the GROMACS development team.
  # Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
@@@ -74,7 -74,7 +74,7 @@@ target_link_libraries(testutils PRIVAT
  
  if(HAVE_TINYXML2)
      include_directories(SYSTEM ${TinyXML2_INCLUDE_DIR})
-     target_link_libraries(testutils ${TinyXML2_LIBRARIES})
+     target_link_libraries(testutils PRIVATE ${TinyXML2_LIBRARIES})
  else()
      include_directories(BEFORE SYSTEM "../external/tinyxml2")
  endif()
index c17ce34641794eb71f4aa37eb87bc2122674eecf,1c4e0d71af97a5f7708fe24e4bf0ed80df9a5021..d668cb5d9c92e091fea6daa099d8cf130aee9de1
@@@ -1,8 -1,7 +1,8 @@@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
 -# Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 +# Copyright (c) 2011,2012,2013,2014,2015 by the GROMACS development team.
 +# Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@@ -34,7 -33,6 +34,7 @@@
  # the research papers on the package. Check out http://www.gromacs.org.
  
  include(CMakeParseArguments)
 +include(gmxClangCudaUtils)
  
  function (gmx_add_unit_test_library NAME)
      if (GMX_BUILD_UNITTESTS AND BUILD_TESTING)
          target_compile_definitions(${NAME} PRIVATE HAVE_CONFIG_H)
          target_include_directories(${NAME} SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
          target_link_libraries(${NAME} PRIVATE testutils gmock)
 -        # clang-3.6 warns about a number of issues that are not reported by more modern compilers
 -        # and we know they are not real issues. So we only check that it can compile without error
 -        # but ignore all warnings.
 -        if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^3\.6")
 -            target_compile_options(${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-w>)
 +        if(GMX_CLANG_TIDY)
 +            set_target_properties(${NAME} PROPERTIES CXX_CLANG_TIDY
 +                "${CLANG_TIDY_EXE};-warnings-as-errors=*;-header-filter=.*")
 +        endif()
 +        if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7")
 +            target_compile_options(${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS} -Wno-gnu-zero-variadic-macro-arguments -Wno-zero-as-null-pointer-constant -Wno-missing-variable-declarations>)
          endif()
  
      endif()
  endfunction ()
  
 +# This function creates a GoogleTest test executable for a module.  It
 +# hides all the complexity of how to treat different source files
 +# under different configuration conditions. It should be extended
 +# if we ever support another GPU compilation approach.
 +#
 +# It can be called with extra options and arguments:
 +#   MPI
 +#     To trigger the ctest runner to run this test with multiple ranks
 +#   HARDWARE_DETECTION
 +#     To trigger the test executable setup code to run hardware detection
 +#   CPP_SOURCE_FILES          file1.cpp file2.cpp ...
 +#     All the normal C++ .cpp source files
 +#   GPU_CPP_SOURCE_FILES  file1.cpp file2.cpp ...
 +#     All the C++ .cpp source files that are always needed, but must be
 +#     compiled in the way that suits GMX_GPU.
 +#   CUDA_CU_SOURCE_FILES      file1.cu  file2.cu  ...
 +#     All the normal CUDA .cu source files
 +#   CUDA_CPP_SOURCE_FILES     file1.cpp file2.cpp ...
 +#     All the other .cpp source files to be compiled as CUDA
 +#   OPENCL_CPP_SOURCE_FILES   file1.cpp file2.cpp ...
 +#     All the other C++ .cpp source files needed only with OpenCL
 +#   NON_GPU_CPP_SOURCE_FILES  file1.cpp file2.cpp ...
 +#     All the other C++ .cpp source files needed only with neither OpenCL nor CUDA
  function (gmx_add_gtest_executable EXENAME)
      if (GMX_BUILD_UNITTESTS AND BUILD_TESTING)
          set(_options MPI HARDWARE_DETECTION)
 -        cmake_parse_arguments(ARG "${_options}" "" "" ${ARGN})
 -        set(_source_files ${ARG_UNPARSED_ARGUMENTS})
 +        set(_multi_value_keywords
 +            CPP_SOURCE_FILES
 +            CUDA_CU_SOURCE_FILES
 +            GPU_CPP_SOURCE_FILES
 +            CUDA_CPP_SOURCE_FILES
 +            OPENCL_CPP_SOURCE_FILES
 +            NON_GPU_CPP_SOURCE_FILES
 +            )
 +        cmake_parse_arguments(ARG "${_options}" "" "${_multi_value_keywords}" ${ARGN})
  
          file(RELATIVE_PATH _input_files_path ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
          set(_temporary_files_path "${CMAKE_CURRENT_BINARY_DIR}/Testing/Temporary")
                   TEST_USES_HARDWARE_DETECTION=true)
          endif()
  
 -        add_executable(${EXENAME} ${UNITTEST_TARGET_OPTIONS}
 -            ${_source_files} ${TESTUTILS_DIR}/unittest_main.cpp)
 +        if (GMX_USE_CUDA AND NOT GMX_CLANG_CUDA)
 +            # Work around FindCUDA, which otherwise prevents using target_link_libraries()
 +            # with keywords.
 +            set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
 +            cuda_add_executable(${EXENAME} ${UNITTEST_TARGET_OPTIONS}
 +                ${ARG_CPP_SOURCE_FILES}
 +                ${ARG_CUDA_CU_SOURCE_FILES}
 +                ${ARG_CUDA_CPP_SOURCE_FILES}
 +                ${ARG_GPU_CPP_SOURCE_FILES}
 +                ${TESTUTILS_DIR}/unittest_main.cpp)
 +        else()
 +            add_executable(${EXENAME} ${UNITTEST_TARGET_OPTIONS}
 +                ${ARG_CPP_SOURCE_FILES}
 +                ${TESTUTILS_DIR}/unittest_main.cpp)
 +        endif()
 +
 +        if (GMX_USE_CUDA)
 +            if (GMX_CLANG_CUDA)
 +                target_sources(${EXENAME} PRIVATE
 +                    ${ARG_CUDA_CU_SOURCE_FILES}
 +                    ${ARG_CUDA_CPP_SOURCE_FILES}
 +                    ${ARG_GPU_CPP_SOURCE_FILES})
 +                set_source_files_properties(${ARG_CUDA_CPP_SOURCE_FILES} ${ARG_GPU_CPP_SOURCE_FILES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
 +                gmx_compile_cuda_file_with_clang(${ARG_CUDA_CU_SOURCE_FILES})
 +                if(ARG_CUDA_CPP_SOURCE_FILES OR ARG_CUDA_CU_SOURCE_FILES OR ARG_GPU_CPP_SOURCE_FILES)
 +                    target_link_libraries(${EXENAME} PRIVATE ${GMX_EXTRA_LIBRARIES})
 +                endif()
 +            endif()
 +        elseif (GMX_USE_OPENCL)
 +            target_sources(${EXENAME} PRIVATE ${ARG_OPENCL_CPP_SOURCE_FILES} ${ARG_GPU_CPP_SOURCE_FILES})
 +            if(ARG_OPENCL_CPP_SOURCE_FILES OR ARG_GPU_CPP_SOURCE_FILES)
 +                target_link_libraries(${EXENAME} PRIVATE ${OpenCL_LIBRARIES})
 +            endif()
 +        else()
 +            target_sources(${EXENAME} PRIVATE ${ARG_NON_GPU_CPP_SOURCE_FILES} ${ARG_GPU_CPP_SOURCE_FILES})
 +        endif()
 +
          gmx_target_compile_options(${EXENAME})
          target_compile_definitions(${EXENAME} PRIVATE HAVE_CONFIG_H ${EXTRA_COMPILE_DEFINITIONS})
          target_include_directories(${EXENAME} SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
              set_target_properties(${EXENAME} PROPERTIES CXX_CLANG_TIDY
                  "${CLANG_TIDY_EXE};-warnings-as-errors=*;-header-filter=.*")
          endif()
 -        if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^6\.0")
 +        if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7")
              target_compile_options(${EXENAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS} -Wno-gnu-zero-variadic-macro-arguments -Wno-zero-as-null-pointer-constant -Wno-missing-variable-declarations>)
          endif()
+         # clang-3.6 warns about a number of issues that are not reported by more modern compilers
+         # and we know they are not real issues. So we only check that it can compile without error
+         # but ignore all warnings.
+         if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^3\.6")
+             target_compile_options(${EXENAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-w>)
+         endif()
      endif()
  endfunction()
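
  As a usage sketch only (the target and source-file names below are hypothetical and not part of this change), a module's tests/CMakeLists.txt would call the function above with the new source-file keywords roughly like this:

      gmx_add_gtest_executable(examplemodule-test
          CPP_SOURCE_FILES
              example_host_tests.cpp
          GPU_CPP_SOURCE_FILES
              example_gpu_agnostic_tests.cpp
          CUDA_CU_SOURCE_FILES
              example_cuda_kernels.cu
          CUDA_CPP_SOURCE_FILES
              example_cuda_wrappers.cpp
          OPENCL_CPP_SOURCE_FILES
              example_opencl_tests.cpp
          NON_GPU_CPP_SOURCE_FILES
              example_cpu_fallback_tests.cpp
          )

  Which of the GPU-specific keywords take effect is decided inside the function by GMX_USE_CUDA, GMX_CLANG_CUDA and GMX_USE_OPENCL, so a caller can list all variants unconditionally.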
  
  #   INTEGRATION_TEST      requires the use of the IntegrationTest label in CTest
  #   SLOW_TEST             requires the use of the SlowTest label in CTest, and
  #                         increases the length of the ctest timeout.
 +#   IGNORE_LEAKS          Skips some memory safety checks.
  #
  # TODO When a test case needs it, generalize the MPI_RANKS mechanism so
  # that ctest can run the test binary over a range of numbers of MPI
  # ranks.
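
  To make the new IGNORE_LEAKS option of the function defined below concrete, a minimal sketch of a registration call (the test and executable names are hypothetical):

      gmx_register_gtest_test(ExampleIntegrationTests examplemodule-test
          OPENMP_THREADS 2
          INTEGRATION_TEST
          IGNORE_LEAKS
          )

  With this change the leak-tolerant test prefix command is requested explicitly via IGNORE_LEAKS rather than being implied by the INTEGRATION_TEST or SLOW_TEST options.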
  function (gmx_register_gtest_test NAME EXENAME)
      if (GMX_BUILD_UNITTESTS AND BUILD_TESTING)
 -        set(_options INTEGRATION_TEST SLOW_TEST)
 +        set(_options INTEGRATION_TEST SLOW_TEST IGNORE_LEAKS)
          set(_one_value_args MPI_RANKS OPENMP_THREADS)
          cmake_parse_arguments(ARG "${_options}" "${_one_value_args}" "" ${ARGN})
          set(_xml_path ${CMAKE_BINARY_DIR}/Testing/Temporary/${NAME}.xml)
              else()
                  set(_timeout 120)
              endif()
 -            gmx_get_test_prefix_cmd(_prefix_cmd IGNORE_LEAKS)
          elseif (ARG_SLOW_TEST)
              list(APPEND _labels SlowTest)
              set(_timeout 480)
 -            gmx_get_test_prefix_cmd(_prefix_cmd IGNORE_LEAKS)
          else()
              list(APPEND _labels UnitTest)
              gmx_get_test_prefix_cmd(_prefix_cmd)
          endif()
 +        if (ARG_IGNORE_LEAKS)
 +            gmx_get_test_prefix_cmd(_prefix_cmd IGNORE_LEAKS)
 +        endif ()
          set(_cmd ${_prefix_cmd} $<TARGET_FILE:${EXENAME}>)
          if (ARG_OPENMP_THREADS)
              if (GMX_OPENMP)