Merge branch 'release-2019' into master
authorPaul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
Kept the way topologyinformation was removed from solvate and
insert_molecules in 2019.

Resolved Conflicts:
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/user-guide/mdp-options.rst
src/gromacs/gmxpreprocess/insert-molecules.cpp
src/gromacs/gmxpreprocess/solvate.cpp
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
src/gromacs/mdlib/sim_util.cpp
src/gromacs/trajectoryanalysis/topologyinformation.cpp

Change-Id: I2576133c4bc7a3c55cedb29e9832bb5afa8fe76f

26 files changed:
1  2 
cmake/gmxManageNvccConfig.cmake
cmake/gmxSimdFlags.cmake
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/install-guide/index.rst
docs/release-notes/index.rst
docs/user-guide/mdp-options.rst
src/gromacs/CMakeLists.txt
src/gromacs/domdec/partition.cpp
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/gmxpreprocess/insert_molecules.cpp
src/gromacs/gmxpreprocess/readir.cpp
src/gromacs/gmxpreprocess/solvate.cpp
src/gromacs/mdlib/constr.cpp
src/gromacs/mdlib/constr.h
src/gromacs/mdlib/force.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/rerun.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/resourcedivision.cpp
src/gromacs/trajectoryanalysis/topologyinformation.cpp

index c4a5abaaf4697ebe00d073c8f5b334c909b0d651,75ed125cbb475cff0f987f99596aa547ffc9f33f..f88297e960f2c1a52cecaf9f11a96cefe63bf250
@@@ -67,10 -67,8 +67,10 @@@ if(CUDA_HOST_COMPILER_CHANGED
      set(CUDA_HOST_COMPILER_OPTIONS "")
  
      if(APPLE AND CMAKE_C_COMPILER_ID MATCHES "GNU")
 -        # Some versions of gcc-4.8 and gcc-4.9 produce errors (in particular on OS X)
 -        # if we do not use -D__STRICT_ANSI__. It is harmless, so we might as well add it for all versions.
 +        # Some versions of gcc-4.8 and gcc-4.9 have produced errors
 +        # (in particular on OS X) if we do not use
 +        # -D__STRICT_ANSI__. It is harmless, so we might as well add
 +        # it for all versions.
          list(APPEND CUDA_HOST_COMPILER_OPTIONS "-D__STRICT_ANSI__")
      endif()
  
@@@ -99,30 -97,54 +99,30 @@@ if (GMX_CUDA_TARGET_SM OR GMX_CUDA_TARG
      endforeach()
  else()
      # Set the CUDA GPU architectures to compile for:
 -    # - with CUDA >=5.0 <6.5:   CC <=3.5 is supported
 -    #     => compile sm_30, sm_35 SASS, and compute_35 PTX
 -    # - with CUDA ==6.5:        CC <=3.7 and 5.0 are supported
 -    #     => compile sm_30, sm_35, sm_37 sm_50, SASS, and compute_50 PTX
 -    # - with CUDA >=7.0         CC 5.2 is supported (5.3, Tegra X1 we don't generate code for)
 -    #     => compile sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
 -    # - with CUDA >=8.0         CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code or it)
 -    #     => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
      # - with CUDA >=9.0         CC 7.0 is supported and CC 2.0 is no longer supported
      #     => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70 SASS, and compute_70 PTX
 -    #
 -    #   Note that CUDA 6.5.19 second patch release supports cc 5.2 too, but
 -    #   CUDA_VERSION does not contain patch version and having PTX 5.0 JIT-ed is
 -    #   equally fast as compiling with sm_5.2 anyway.
 +    # - with CUDA >=10.0        CC 7.5 is supported
 +    #     => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70, sm_75 SASS, and compute_75 PTX
  
      # First add flags that trigger SASS (binary) code generation for physical arch
      list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_30,code=sm_30")
      list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=sm_35")
 -
 -    if(NOT CUDA_VERSION VERSION_LESS "6.5") # >= 6.5
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_37,code=sm_37")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=sm_50")
 -    endif()
 -    if(NOT CUDA_VERSION VERSION_LESS "7.0") # >= 7.0
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=sm_52")
 -    endif()
 -    if(NOT CUDA_VERSION VERSION_LESS "8.0") # >= 8.0
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
 -    endif()
 -    if(NOT CUDA_VERSION VERSION_LESS "9.0") # >= 9.0
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=sm_70")
 -    endif()
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_37,code=sm_37")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=sm_50")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=sm_52")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=sm_70")
  
      # Next add flags that trigger PTX code generation for the newest supported virtual arch
      # that's useful to JIT to future architectures
 -    if(CUDA_VERSION VERSION_LESS "6.5")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=compute_35")
 -    elseif(CUDA_VERSION VERSION_LESS "7.0")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
 -    elseif(CUDA_VERSION VERSION_LESS "8.0")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
 -    elseif(CUDA_VERSION VERSION_LESS "9.0")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
 -    elseif(CUDA_VERSION VERSION_LESS "10.0")
 -        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=compute_70")
 -    else() # version >= 10.0
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=compute_35")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
 +    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=compute_70")
 +    if(NOT CUDA_VERSION VERSION_LESS "10.0")
          list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_75,code=compute_75")
      endif()
  endif()
@@@ -136,23 -158,6 +136,23 @@@ if (GMX_CUDA_TARGET_COMPUTE
      set_property(CACHE GMX_CUDA_TARGET_COMPUTE PROPERTY TYPE STRING)
  endif()
  
 +# FindCUDA.cmake is unaware of the mechanism used by cmake to embed
 +# the compiler flag for the required C++ standard in the generated
 +# build files, so we have to pass it ourselves
 +if (MSVC)
 +    # We use C++14 on MSVC, but cmake does not understand the
 +    # necessary compilation option for that until version 3.10, so we
 +    # can remove this after we require that version.
 +    if (NOT CMAKE_CXX14_STANDARD_COMPILE_OPTION)
 +        set(GMX_CXX_STANDARD_COMPILE_OPTION "-std:c++14")
 +    else()
 +        set(GMX_CXX_STANDARD_COMPILE_OPTION "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
 +    endif()
 +else()
 +    set(GMX_CXX_STANDARD_COMPILE_OPTION "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
 +endif()
 +list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CXX_STANDARD_COMPILE_OPTION}")
 +
  # assemble the CUDA flags
  list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CUDA_NVCC_GENCODE_FLAGS}")
  list(APPEND GMX_CUDA_NVCC_FLAGS "-use_fast_math")
@@@ -163,7 -168,15 +163,15 @@@ list(APPEND GMX_CUDA_NVCC_FLAGS "${CUDA
  string(TOUPPER "${CMAKE_BUILD_TYPE}" _build_type)
  gmx_check_if_changed(_cuda_nvcc_executable_or_flags_changed CUDA_NVCC_EXECUTABLE CUDA_NVCC_FLAGS CUDA_NVCC_FLAGS_${_build_type})
  
- if(_cuda_nvcc_executable_or_flags_changed OR CUDA_HOST_COMPILER_CHANGED OR NOT GMX_NVCC_WORKS)
+ # We would like to be helpful and reject the host compiler with a
+ # clear error message at configure time, rather than let nvcc
+ # later reject the host compiler as not supported when the first
+ # CUDA source file is built. We've implemented that for current
+ # nvcc running on Unix-like systems, but e.g. changes to nvcc
+ # will further affect the limited portability of this checking
+ # code. Set the CMake variable GMX_NVCC_WORKS on if you want to
+ # bypass this check.
+ if((_cuda_nvcc_executable_or_flags_changed OR CUDA_HOST_COMPILER_CHANGED OR NOT GMX_NVCC_WORKS) AND NOT WIN32)
      message(STATUS "Check for working NVCC/C compiler combination")
      execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -ccbin ${CUDA_HOST_COMPILER} -c ${CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS_${_build_type}} ${CMAKE_SOURCE_DIR}/cmake/TestCUDA.cu
          RESULT_VARIABLE _cuda_test_res
diff --combined cmake/gmxSimdFlags.cmake
index bf17e6af819fd29be44935b6b1cbeb60b88e95e4,83b3e97ffba9cb181ab5605ed596f9f40a2a3a5b..3dce16fc942189fc4343d8d4ac891e7f34042c38
@@@ -58,14 -58,19 +58,19 @@@ macro(find_power_vsx_toolchain_flags TO
          # VSX uses the same function API as Altivec/VMX, so make sure we tune for the current CPU and not VMX.
          # By putting these flags here rather than in the general compiler flags file we can safely assume
          # that we are at least on Power7 since that is when VSX appeared.
+         # NOTE:  Enabling instruction fusion on Power8/9 using -mpower8-fusion/-mpower9-fusion
+         #        seems to produce buggy code (see #2747, #2746, #2734).
+         #        Note that instruction fusion does have considerable performance benefits
+         #        (up to 8% measured with gcc 8) if the issue is resolved the flag can be re-enabled.
          gmx_run_cpu_detection(brand)
          if(CPU_DETECTION_BRAND MATCHES "POWER7")
              gmx_test_cflag(GNU_C_VSX_POWER7   "-mcpu=power7 -mtune=power7" ${TOOLCHAIN_C_FLAGS_VARIABLE})
              gmx_test_cflag(GNU_CXX_VSX_POWER7 "-mcpu=power7 -mtune=power7" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
          elseif(CPU_DETECTION_BRAND MATCHES "POWER8")
              # Enable power8 vector extensions on such platforms.
-             gmx_test_cflag(GNU_C_VSX_POWER8   "-mcpu=power8 -mpower8-vector -mpower8-fusion" ${TOOLCHAIN_C_FLAGS_VARIABLE})
-             gmx_test_cflag(GNU_CXX_VSX_POWER8 "-mcpu=power8 -mpower8-vector -mpower8-fusion" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
+             gmx_test_cflag(GNU_C_VSX_POWER8   "-mcpu=power8 -mpower8-vector" ${TOOLCHAIN_C_FLAGS_VARIABLE})
+             gmx_test_cflag(GNU_CXX_VSX_POWER8 "-mcpu=power8 -mpower8-vector" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
          elseif(CPU_DETECTION_BRAND MATCHES "POWER9")
              # Enable power9 vector extensions on such platforms.
              # TODO consider whether adding " -mpower9-vector -mpower9-fusion"
          else()
              # Don't add arch-specific flags for unknown architectures.
          endif()
 -        # Altivec was originally single-only, and it took a while for compilers
 -        # to support the double-precision features in VSX.
 -        if(GMX_DOUBLE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
 -            message(FATAL_ERROR "Using VSX SIMD in double precision with GCC requires GCC-4.9 or later.")
 -        endif()
      endif()
      if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL" OR ${CMAKE_C_COMPILER_ID} MATCHES "XL")
          if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "13.1.5" OR CMAKE_C_COMPILER_VERSION VERSION_LESS "13.1.5")
@@@ -128,6 -138,30 +133,6 @@@ endfunction(
  function(gmx_find_simd_avx_128_fma_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIABLE CXX_FLAGS_VARIABLE)
      find_x86_toolchain_flags(TOOLCHAIN_C_FLAGS TOOLCHAIN_CXX_FLAGS)
  
 -    # We don't have the full compiler version string yet (BUILD_C_COMPILER),
 -    # so we can't distinguish vanilla from Apple clang versions, but catering for a few rare AMD
 -    # hackintoshes is not worth the effort.
 -    if (APPLE AND (CMAKE_C_COMPILER_ID STREQUAL "Clang" OR
 -                CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
 -        message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA SIMD.")
 -    endif()
 -
 -    # clang <=3.2 contains a bug that causes incorrect code to be generated for the
 -    # vfmaddps instruction and therefore the bug is triggered with AVX_128_FMA.
 -    # (see: http://llvm.org/bugs/show_bug.cgi?id=15040).
 -    # We can work around this by not using the integrated assembler (except on OS X
 -    # which has an outdated assembler that does not support AVX instructions).
 -    if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.3")
 -        # we assume that we have an external assembler that supports AVX
 -        message(STATUS "Clang ${CMAKE_C_COMPILER_VERSION} detected, enabling FMA bug workaround")
 -        set(TOOLCHAIN_C_FLAGS "${TOOLCHAIN_C_FLAGS} -no-integrated-as")
 -    endif()
 -    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "3.3")
 -        # we assume that we have an external assembler that supports AVX
 -        message(STATUS "Clang ${CMAKE_CXX_COMPILER_VERSION} detected, enabling FMA bug workaround")
 -        set(TOOLCHAIN_CXX_FLAGS "${TOOLCHAIN_CXX_FLAGS} -no-integrated-as")
 -    endif()
 -
      # AVX128/FMA on AMD is a bit complicated. We need to do detection in three stages:
      # 1) Find the flags required for generic AVX support
      # 2) Find the flags necessary to enable fused-multiply add support
index a804d132e93ba529643f81edfbd2d54df1548ba9,2d86decf59e4df012a3aa4d60eb9a6990bc5b3e7..5dcffe6707ad6cc4538176a3f3440b213c8da930
@@@ -57,7 -57,6 +57,7 @@@
  #         GROMACS     2016   2
  #         GROMACS     2018   3
  #         GROMACS     2019   4
 +#         GROMACS     2020   5
  #   LIBRARY_SOVERSION_MINOR so minor version for the built libraries.
  #       Should be increased for each release that changes only the implementation.
  #       In GROMACS, the typical policy is to increase it for each patch version
@@@ -83,7 -82,6 +83,6 @@@
  #       This ID is used for the source code tarball.
  #   GMX_MANUAL_DOI_ID
  #       Same as above, but for the reference manual.
- # Setting and retrieving of those variables is handled in gmxCheckReleaseDOI.cmake
  # They are collected into a single section below.
  # The following variables are set based on these:
  #   GMX_VERSION            String composed from GMX_VERSION_* numeric variables
  
  # The GROMACS convention is that these are the version number of the next
  # release that is going to be made from this branch.
 -set(GMX_VERSION_MAJOR 2019)
 -set(GMX_VERSION_PATCH 2)
 +set(GMX_VERSION_MAJOR 2020)
 +set(GMX_VERSION_PATCH 0)
  # The suffix, on the other hand, is used mainly for betas and release
  # candidates, where it signifies the most recent such release from
  # this branch; it will be empty before the first such release, as well
@@@ -213,7 -211,7 +212,7 @@@ set(GMX_VERSION_SUFFIX ""
  # here. The important thing is to minimize the chance of third-party
  # code being able to dynamically link with a version of libgromacs
  # that might not work.
 -set(LIBRARY_SOVERSION_MAJOR 4)
 +set(LIBRARY_SOVERSION_MAJOR 5)
  set(LIBRARY_SOVERSION_MINOR 0)
  set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
  
@@@ -236,13 -234,13 +235,13 @@@ if (NOT SOURCE_IS_SOURCE_DISTRIBUTION A
  endif()
  
  set(REGRESSIONTEST_VERSION "${GMX_VERSION_STRING}")
 -set(REGRESSIONTEST_BRANCH "refs/heads/release-2019")
 +set(REGRESSIONTEST_BRANCH "refs/heads/master")
  # Run the regressiontests packaging job with the correct pakage
  # version string, and the release box checked, in order to have it
  # build the regressiontests tarball with all the right naming. The
  # naming affects the md5sum that has to go here, and if it isn't right
  # release workflow will report a failure.
 -set(REGRESSIONTEST_MD5SUM "36103a0078bf9a802e16ec982b4e5c4c" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
 +set(REGRESSIONTEST_MD5SUM "3d06d41e07f523d70ae575b9ad75c670" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
  
  math(EXPR GMX_VERSION_NUMERIC
       "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
diff --combined docs/CMakeLists.txt
index 8ad2e04f5a0fa9e773899a049354432bb002a513,65c8c5756f5b14ece9c420aa7be1ef112ef0bec5..9f6c80457a8492f81970588fc1ee8a9412cb69bf
@@@ -245,7 -245,6 +245,7 @@@ if (SPHINX_FOUND
          reference-manual/analysis/rmsd.rst
          reference-manual/analysis/covariance-analysis.rst
          reference-manual/analysis/dihedral-pca.rst
 +        reference-manual/analysis/hydrogen-bonds.rst
          reference-manual/analysis/protein-related.rst
          reference-manual/analysis/interface-related.rst)
      # The image files have also been ordered by the respective
          how-to/visualize.rst
          install-guide/index.rst
          release-notes/index.rst
 +        release-notes/highlights.rst
 +        release-notes/features.rst
 +        release-notes/performance.rst
 +        release-notes/tools.rst
 +        release-notes/bugs-fixed.rst
 +        release-notes/removed-functionality.rst
 +        release-notes/deprecated-functionality.rst
 +        release-notes/portability.rst
 +        release-notes/miscellaneous.rst
+         release-notes/2019/2019.2.rst
          release-notes/2019/2019.1.rst
          release-notes/2019/major/highlights.rst
          release-notes/2019/major/features.rst
      add_custom_target(install-guide
          COMMAND
              ${SPHINX_EXECUTABLE}
 -            -q -E -b text
 +            -q -b text
              -w sphinx-install.log
              -d ${CMAKE_CURRENT_BINARY_DIR}/install-guide/_doctrees
              -c ${SPHINX_INPUT_DIR}
      add_custom_target(webpage-sphinx
          DEPENDS sphinx-programs
          DEPENDS sphinx-input
 -        DEPENDS sphinx-image-conversion 
 +        DEPENDS sphinx-image-conversion
 +        DEPENDS manual 
          COMMAND
              ${CMAKE_COMMAND} -E make_directory ${SPHINX_INPUT_DIR}/_static
          COMMAND
              ${SPHINX_EXECUTABLE}
 -            -q -E -b html
 +            -q -b html
              -w sphinx-html.log
              -d "${SPHINX_CACHE_DIR}"
              "${SPHINX_INPUT_DIR}"
      add_custom_target(man
          COMMAND
              ${SPHINX_EXECUTABLE}
 -            -q -E -b man
 +            -q -b man
              -w sphinx-man.log
              -d ${SPHINX_CACHE_DIR}
              -t do_man
index 20d3b60d44c9174a0997d2aea3a34f5bd0af3485,4fd59acaa4b0a74d9eeeb344ed2d20137305f50a..76cf1f8ad367c42b47d30b8808544deba7762e25
@@@ -55,7 -55,7 +55,7 @@@ Quick and dirty cluster installatio
  
  On a cluster where users are expected to be running across multiple
  nodes using MPI, make one installation similar to the above, and
 -another using an MPI wrapper compiler and which is `building only
 +another using ``-DGMX_MPI=on`` and which is `building only
  mdrun`_, because that is the only component of |Gromacs| that uses
  MPI. The latter will install a single simulation engine binary,
  i.e. ``mdrun_mpi`` when the default suffix is used. Hence it is safe
@@@ -104,20 -104,20 +104,20 @@@ PowerPC including POWER8, ARM v7, ARM v
  Compiler
  ^^^^^^^^
  
 -|Gromacs| can be compiled on any platform with ANSI C99 and C++11
 +|Gromacs| can be compiled on any platform with ANSI C99 and C++14
  compilers, and their respective standard C/C++ libraries. Good
  performance on an OS and architecture requires choosing a good
  compiler. We recommend gcc, because it is free, widely available and
  frequently provides the best performance.
  
  You should strive to use the most recent version of your
 -compiler. Since we require full C++11 support the minimum supported
 +compiler. Since we require full C++14 support the minimum supported
  compiler versions are
  
 -* GNU (gcc) 4.8.1
 +* GNU (gcc) 5.1
  * Intel (icc) 17.0.1
 -* LLVM (clang) 3.3
 -* Microsoft (MSVC) 2017 (C++14 is used)
 +* LLVM (clang) 3.6
 +* Microsoft (MSVC) 2017
  
  Other compilers may work (Cray, Pathscale, older clang) but do
  not offer competitive performance. We recommend against PGI because
@@@ -131,24 -131,32 +131,24 @@@ You may also need the most recent versi
  components beside the compiler itself (e.g. assembler or linker);
  these are often shipped by your OS distribution's binutils package.
  
 -C++11 support requires adequate support in both the compiler and the
 +C++14 support requires adequate support in both the compiler and the
  C++ library. The gcc and MSVC compilers include their own standard
 -libraries and require no further configuration. For configuration of
 -other compilers, read on.
 +libraries and require no further configuration. If your vendor's
 +compiler also manages the standard library library via compiler flags,
 +these will be honored. For configuration of other compilers, read on.
  
  On Linux, both the Intel and clang compiler use the libstdc++ which
  comes with gcc as the default C++ library. For |Gromacs|, we require
 -the compiler to support libstc++ version 4.8.1 or higher. To select a
 -particular libstdc++ library, use:
 -
 -* For Intel: ``-DGMX_STDLIB_CXX_FLAGS=-gcc-name=/path/to/gcc/binary``
 -  or make sure that the correct gcc version is first in path (e.g. by
 -  loading the gcc module). It can also be useful to add
 -  ``-DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,/path/to/gcc/lib64
 -  -L/path/to/gcc/lib64"`` to ensure linking works correctly.
 -* For clang:
 -  ``-DCMAKE_CXX_FLAGS=--gcc-toolchain=/path/to/gcc/folder``. This
 -  folder should contain ``include/c++``.
 +the compiler to support libstc++ version 5.1 or higher. To select a
 +particular libstdc++ library, provide the path to g++ with
 +``-DGMX_GPLUSPLUS_PATH=/path/to/g++``.
  
  On Windows with the Intel compiler, the MSVC standard library is used,
  and at least MSVC 2017 is required. Load the enviroment variables with
  vcvarsall.bat.
  
 -To build with any compiler and clang's libcxx standard library, use
 -``-DGMX_STDLIB_CXX_FLAGS=-stdlib=libc++
 --DGMX_STDLIB_LIBRARIES='-lc++abi -lc++'``.
 +To build with clang and llvm's libcxx standard library, use
 +``-DCMAKE_CXX_FLAGS=-stdlib=libc++``.
  
  If you are running on Mac OS X, the best option is the Intel
  compiler. Both clang and gcc will work, but they produce lower
@@@ -179,7 -187,8 +179,7 @@@ GPU suppor
  
  |Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
  On Linux, NVIDIA CUDA_ toolkit with minimum version |REQUIRED_CUDA_VERSION|
 -is required, and the latest version is strongly encouraged. Using
 -Microsoft MSVC compiler requires version 9.0. NVIDIA GPUs with at
 +is required, and the latest version is strongly encouraged. NVIDIA GPUs with at
  least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
  required. You are strongly recommended to
  get the latest CUDA version and driver that supports your hardware, but
@@@ -194,7 -203,8 +194,8 @@@ version for |Gromacs| code as used as t
  
  To make it possible to use other accelerators, |Gromacs| also includes
  OpenCL_ support. The minimum OpenCL version required is
- |REQUIRED_OPENCL_MIN_VERSION|. The current OpenCL implementation is recommended for
+ |REQUIRED_OPENCL_MIN_VERSION| and only 64-bit implementations are supported.
+ The current OpenCL implementation is recommended for
  use with GCN-based AMD GPUs, and on Linux we recommend the ROCm runtime.
  Intel integrated GPUs are supported with the Neo drivers.
  OpenCL is also supported with NVIDIA GPUs, but using
@@@ -203,7 -213,8 +204,8 @@@ recommended. Also note that there are p
  to the NVIDIA OpenCL runtime).
  It is not possible to configure both CUDA and OpenCL
  support in the same build of |Gromacs|, nor to support both
- Intel and other vendors' GPUs with OpenCL.
+ Intel and other vendors' GPUs with OpenCL. A 64-bit implementation
+ of OpenCL is required and therefore OpenCL is only supported on 64-bit platforms.
  
  .. _mpi-support:
  
@@@ -221,11 -232,6 +223,11 @@@ you will need to hav
    standard, and
  * wrapper compilers that will compile code using that library.
  
 +To compile with MPI set your compiler to the normal (non-MPI) compiler
 +and add ``-DGMX_MPI=on`` to the cmake options. It is possible to set
 +the compiler to the MPI compiler wrapper but it is neither necessary
 +nor recommended.
 +
  The |Gromacs| team recommends OpenMPI_ version
  1.6 (or higher), MPICH_ version 1.4.1 (or
  higher), or your hardware vendor's MPI installation. The most recent
@@@ -663,8 -669,8 +665,8 @@@ best-tested and supported of these. Lin
  CPUs also works well.
  
  Experimental support is available for compiling CUDA code, both for host and
 -device, using clang (version 3.9 or later).
 -A CUDA toolkit (>= v7.0) is still required but it is used only for GPU device code
 +device, using clang (version 6.0 or later).
 +A CUDA toolkit is still required but it is used only for GPU device code
  generation and to link against the CUDA runtime library.
  The clang CUDA support simplifies compilation and provides benefits for development
  (e.g. allows the use code sanitizers in CUDA host-code).
@@@ -677,7 -683,7 +679,7 @@@ virtual architecture code is always emb
  Note that this is mainly a developer-oriented feature and it is not recommended
  for production use as the performance can be significantly lower than that
  of code compiled with nvcc (and it has also received less testing).
 -However, note that with clang 5.0 the performance gap is significantly narrowed
 +However, note that since clang 5.0 the performance gap is only moderate
  (at the time of writing, about 20% slower GPU kernels), so this version
  could be considered in non performance-critical use-cases.
  
@@@ -1255,12 -1261,11 +1257,12 @@@ much everywhere, it is important that w
  it works because we have tested it. We do test on Linux, Windows, and
  Mac with a range of compilers and libraries for a range of our
  configuration options. Every commit in our git source code repository
 -is currently tested on x86 with a number of gcc versions ranging from 4.8.1
 -through 7, versions 16 and 18 of the Intel compiler, and Clang
 -versions 3.4 through 5. For this, we use a variety of GNU/Linux
 +is currently tested on x86 with a number of gcc versions ranging from 5.1
 +through 8.1, version 19 of the Intel compiler, and Clang
 +versions 3.6 through 7. For this, we use a variety of GNU/Linux
  flavors and versions as well as recent versions of Windows. Under
  Windows, we test both MSVC 2017 and version 16 of the Intel compiler.
 +Other compiler, library, and OS versions are tested less frequently.
  For details, you can
  have a look at the `continuous integration server used by GROMACS`_,
  which runs Jenkins_.
index ac2a9646f992ee4abd381635cf2e1e43762d50ce,93bdb09b56d0f82c15a2a2ad7b36c531bfa835d8..b0d564b938b259ac453d586122c83db2daba1522
@@@ -8,35 -8,18 +8,35 @@@ releases of |Gromacs|. Major releases c
  functionality supported, whereas patch releases contain only fixes for
  issues identified in the corresponding major releases.
  
 -Two versions of |Gromacs| are under active maintenance, the 2019
 -series and the 2018 series. In the latter, only highly conservative
 +Two versions of |Gromacs| are under active maintenance, the NEXT
 +series and the 2019 series. In the latter, only highly conservative
  fixes will be made, and only to address issues that affect scientific
  correctness. Naturally, some of those releases will be made after the
 -year 2018 ends, but we keep 2018 in the name so users understand how
 +year 2019 ends, but we keep 2018 in the name so users understand how
  up to date their version is. Such fixes will also be incorporated into
 -the 2019 release series, as appropriate. Around the time the 2020
 -release is made, the 2018 series will no longer be maintained.
 +the NEXT release series, as appropriate. Around the time the NEXT+1
 +release is made, the 2019 series will no longer be maintained.
  
  Where issue numbers are reported in these release notes, more details
  can be found at https://redmine.gromacs.org at that issue number.
  
 +|Gromacs| NEXT series
 +---------------------
 +
 +.. toctree::
 +   :maxdepth: 1
 +
 +   highlights
 +   features
 +   performance
 +   tools
 +   bugs-fixed
 +   deprecated-functionality
 +   removed-functionality
 +   portability
 +   miscellaneous
 +
 +
  |Gromacs| 2019 series
  ---------------------
  
@@@ -46,6 -29,7 +46,7 @@@ Patch release
  .. toctree::
     :maxdepth: 1
  
+    2019/2019.2
     2019/2019.1
  
  Major release
index ead69bfce4fe30c8c451563bd72ade01d1c9d66a,1e41d0cd1ccc1e16a15e88fb3671c8d09e30457d..3801cb8d331224c259404895f7b08a0bc319e320
@@@ -1075,7 -1075,7 +1075,7 @@@ Temperature couplin
     integrators, the leap-frog :mdp-value:`integrator=md` integrator
     only supports 1. Data for the NH chain variables is not printed
     to the :ref:`edr` file by default, but can be turned on with the
 -   :mdp:`print-nose-hoover-chains` option.
 +   :mdp:`print-nose-hoover-chain-variables` option.
  
  .. mdp:: print-nose-hoover-chain-variables
  
@@@ -3041,28 -3041,29 +3041,29 @@@ Non-equilibrium M
  Electric fields
  ^^^^^^^^^^^^^^^
  
- .. mdp:: electric-field-x ; electric-field-y ; electric-field-z
+ .. mdp:: electric-field-x
+ .. mdp:: electric-field-y
+ .. mdp:: electric-field-z
  
     Here you can specify an electric field that optionally can be
     alternating and pulsed. The general expression for the field
     has the form of a gaussian laser pulse:
  
-    E(t) = E0 exp ( -(t-t0)\ :sup:`2`/(2 sigma\ :sup:`2`) ) cos(omega (t-t0))
+    .. math:: E(t) = E_0 \exp\left[-\frac{(t-t_0)^2}{2\sigma^2}\right]\cos\left[\omega (t-t_0)\right]
  
     For example, the four parameters for direction x are set in the
-    three fields of ``electric-field-x`` (and similar for y and z)
-    like
+    fields of :mdp:`electric-field-x` (and similar for ``electric-field-y``
+    and ``electric-field-z``) like
  
-    electric-field-x  = E0 omega t0 sigma
+    ``electric-field-x  = E0 omega t0 sigma``
  
-    In the special case that sigma = 0, the exponential term is omitted
-    and only the cosine term is used. If also omega = 0 a static
-    electric field is applied.
+    with units (respectively) V nm\ :sup:`-1`, ps\ :sup:`-1`, ps, ps.
  
-    More details in Carl Caleman and David van der Spoel: Picosecond
-    Melting of Ice by an Infrared Laser Pulse - A Simulation Study.
-    Angew. Chem. Intl. Ed. 47 pp. 14 17-1420 (2008)
+    In the special case that ``sigma = 0``, the exponential term is omitted
+    and only the cosine term is used. If also ``omega = 0`` a static
+    electric field is applied.
  
+    Read more at :ref:`electric fields` and in ref. \ :ref:`146 <refCaleman2008a>`.
  
  
  Mixed quantum/classical molecular dynamics
index 71d72783dac0c09e7614d92b97b56422e45368d9,f94af553a91047274f879856836782856d62abcb..82de302e9d2683d2634b361b532c91037e3eea23
@@@ -95,9 -95,8 +95,9 @@@ endfunction(
  
  add_subdirectory(gmxlib)
  add_subdirectory(mdlib)
 -add_subdirectory(applied-forces)
 -add_subdirectory(listed-forces)
 +add_subdirectory(applied_forces)
 +add_subdirectory(listed_forces)
 +add_subdirectory(nbnxm)
  add_subdirectory(commandline)
  add_subdirectory(domdec)
  add_subdirectory(ewald)
@@@ -108,7 -107,6 +108,7 @@@ add_subdirectory(linearalgebra
  add_subdirectory(math)
  add_subdirectory(mdrun)
  add_subdirectory(mdrunutility)
 +add_subdirectory(mdspan)
  add_subdirectory(mdtypes)
  add_subdirectory(onlinehelp)
  add_subdirectory(options)
@@@ -282,7 -280,7 +282,7 @@@ endif(
  # source files, and issue warnings about that. Remove the use of
  # -Wno-missing-prototypes here and above when the group scheme is
  # removed.
- if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
      target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes")
  endif()
  if (HAS_NO_MSVC_UNUSED)
@@@ -309,17 -307,15 +309,17 @@@ target_link_libraries(libgromac
                        ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
                        ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
                        ${OpenCL_LIBRARIES}
 -                      ${GMX_STDLIB_LIBRARIES}
                        PUBLIC
                        ${GMX_PUBLIC_LIBRARIES}
                        )
 +if (GMX_OPENMP)
 +    target_link_libraries(libgromacs PUBLIC OpenMP::OpenMP_CXX)
 +endif()
  set_target_properties(libgromacs PROPERTIES
                        OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
                        SOVERSION ${LIBRARY_SOVERSION_MAJOR}
                        VERSION ${LIBRARY_VERSION}
 -                      COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 +                      )
  
  gmx_manage_lmfit()
  target_link_libraries(libgromacs PRIVATE lmfit)
@@@ -413,35 -409,35 +413,35 @@@ if(GMX_USE_OPENCL
          DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
          COMPONENT libraries)
  
 -    # Install the NB source and headers
 +    # Install the NBNXM source and headers
      file(GLOB OPENCL_INSTALLED_FILES
 -        mdlib/nbnxn_consts.h
 +        nbnxm/constants.h
          )
      install(FILES ${OPENCL_INSTALLED_FILES}
 -        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib
 +        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm
          COMPONENT libraries)
      file(GLOB OPENCL_INSTALLED_FILES
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
 -        mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
 +        nbnxm/opencl/nbnxm_ocl_kernels.cl
 +        nbnxm/opencl/nbnxm_ocl_kernel.clh
 +        nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
 +        nbnxm/opencl/nbnxm_ocl_kernels.clh
 +        nbnxm/opencl/nbnxm_ocl_kernels_fastgen.clh
 +        nbnxm/opencl/nbnxm_ocl_kernels_fastgen_add_twincut.clh
 +        nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
 +        nbnxm/opencl/nbnxm_ocl_consts.h
          )
      install(FILES ${OPENCL_INSTALLED_FILES}
 -        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl
 +        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm/opencl
          COMPONENT libraries)
  
      # Install the PME source and headers
      file(GLOB OPENCL_INSTALLED_FILES
 -        ewald/pme-spread.clh
 -        ewald/pme-solve.clh
 -        ewald/pme-gather.clh
 -        ewald/pme-gpu-utils.clh
 -        ewald/pme-program.cl
 -        ewald/pme-gpu-types.h
 +        ewald/pme_spread.clh
 +        ewald/pme_solve.clh
 +        ewald/pme_gather.clh
 +        ewald/pme_gpu_utils.clh
 +        ewald/pme_program.cl
 +        ewald/pme_gpu_types.h
          )
      install(FILES ${OPENCL_INSTALLED_FILES}
          DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
index 83ffe74855a82633455142c29b360b8769ce0134,9c963f502118c91ade558a9a2f501cbd08eff90f..d2764efd62f3497826aa2de90bd72f79279f44c7
@@@ -70,6 -70,8 +70,6 @@@
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/mdlib/mdatoms.h"
  #include "gromacs/mdlib/mdsetup.h"
 -#include "gromacs/mdlib/nb_verlet.h"
 -#include "gromacs/mdlib/nbnxn_grid.h"
  #include "gromacs/mdlib/nsgrid.h"
  #include "gromacs/mdlib/vsite.h"
  #include "gromacs/mdtypes/commrec.h"
@@@ -78,7 -80,6 +78,7 @@@
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/mdtypes/nblist.h"
  #include "gromacs/mdtypes/state.h"
 +#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/topology/mtop_util.h"
@@@ -455,7 -456,7 +455,7 @@@ static void restoreAtomGroups(gmx_domde
      /* Copy back the global charge group indices from state
       * and rebuild the local charge group to atom index.
       */
 -    for (gmx::index i = 0; i < atomGroupsState.size(); i++)
 +    for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
      {
          const int atomGroupGlobal  = atomGroupsState[i];
          const int groupSize        = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
@@@ -2111,7 -2112,7 +2111,7 @@@ static void setup_dd_communication(gmx_
                      cg0 = cg1 - cd->ind[p-1].nrecv[zone];
                  }
  
 -                const int numThreads = static_cast<int>(comm->dth.size());
 +                const int numThreads = gmx::ssize(comm->dth);
  #pragma omp parallel for num_threads(numThreads) schedule(static)
                  for (int th = 0; th < numThreads; th++)
                  {
@@@ -2631,7 -2632,7 +2631,7 @@@ orderVector(gmx::ArrayRef<const gmx_cgs
              gmx::ArrayRef<T>                   vectorToSort,
              std::vector<T>                    *workVector)
  {
 -    if (gmx::index(workVector->size()) < sort.size())
 +    if (gmx::index(workVector->size()) < sort.ssize())
      {
          workVector->resize(sort.size());
      }
@@@ -2709,8 -2710,11 +2709,11 @@@ static void dd_sort_order(const gmx_dom
  
      const int  movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
  
-     if (ncg_home_old >= 0)
+     if (ncg_home_old >= 0 && !sort->sorted.empty())
      {
+         GMX_RELEASE_ASSERT(sort->sorted.size() == static_cast<size_t>(ncg_home_old),
+                            "The sorting buffer should contain the old home charge group indices");
          std::vector<gmx_cgsort_t> &stationary = sort->stationary;
          std::vector<gmx_cgsort_t> &moved      = sort->moved;
  
@@@ -2843,7 -2847,7 +2846,7 @@@ static void dd_sort_state(gmx_domdec_t 
  
      /* Reorder the state */
      gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
 -    GMX_RELEASE_ASSERT(cgsort.size() == dd->ncg_home, "We should sort all the home atom groups");
 +    GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
  
      if (state->flags & (1 << estX))
      {
      else
      {
          /* Copy the sorted ns cell indices back to the ns grid struct */
 -        for (gmx::index i = 0; i < cgsort.size(); i++)
 +        for (gmx::index i = 0; i < cgsort.ssize(); i++)
          {
              fr->ns->grid->cell_index[i] = cgsort[i].nsc;
          }
@@@ -3001,7 -3005,7 +3004,7 @@@ void dd_partition_system(FIL
                           gmx_bool                 bMasterState,
                           int                      nstglobalcomm,
                           t_state                 *state_global,
 -                         const gmx_mtop_t        *top_global,
 +                         const gmx_mtop_t        &top_global,
                           const t_inputrec        *ir,
                           t_state                 *state_local,
                           PaddedVector<gmx::RVec> *f,
                    true, xGlobal,
                    &ddbox);
  
 -        distributeState(mdlog, dd, *top_global, state_global, ddbox, state_local, f);
 +        distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
  
          dd_make_local_cgs(dd, &top_local->cgs);
  
                                    fr->cginfo,
                                    state_local->x,
                                    ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
 -                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                                    fr->nbv->nbat);
  
                  nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
  
          wallcycle_sub_stop(wcycle, ewcsDD_GRID);
      }
+     else
+     {
+         /* With the group scheme the sorting array is part of the DD state,
+          * but it just got out of sync, so mark as invalid by emptying it.
+          */
+         if (ir->cutoff_scheme == ecutsGROUP)
+         {
+             comm->sort->sorted.clear();
+         }
+     }
  
      if (comm->useUpdateGroups)
      {
                  if (dd->splitConstraints || dd->splitSettles)
                  {
                      /* Only for inter-cg constraints we need special code */
 -                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                    n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
                                                    constr, ir->nProjOrder,
                                                    top_local->idef.il);
                  }
      if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
      {
          dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
 -        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +        write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
                       -1, state_local->x.rvec_array(), state_local->box);
      }
  
      if (comm->DD_debug > 0)
      {
          /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 -        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +        check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
                                  "after partitioning");
      }
  
index f906a04e45a9d120cd6d8cf9bea7ba9474d644ad,d687e855cbb229a5a51d1164b1f731ddbe835190..7221c3b9a9f7e1b9d5af975faa19c15ee063ad6f
@@@ -83,7 -83,7 +83,7 @@@
  #include <list>
  
  #include "gromacs/domdec/domdec.h"
 -#include "gromacs/ewald/ewald-utils.h"
 +#include "gromacs/ewald/ewald_utils.h"
  #include "gromacs/fft/parallel_3dfft.h"
  #include "gromacs/fileio/pdbio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/utility/unique_cptr.h"
  
 -#include "calculate-spline-moduli.h"
 -#include "pme-gather.h"
 -#include "pme-gpu-internal.h"
 -#include "pme-grid.h"
 -#include "pme-internal.h"
 -#include "pme-redistribute.h"
 -#include "pme-solve.h"
 -#include "pme-spline-work.h"
 -#include "pme-spread.h"
 +#include "calculate_spline_moduli.h"
 +#include "pme_gather.h"
 +#include "pme_gpu_internal.h"
 +#include "pme_grid.h"
 +#include "pme_internal.h"
 +#include "pme_redistribute.h"
 +#include "pme_solve.h"
 +#include "pme_spline_work.h"
 +#include "pme_spread.h"
  
  /*! \brief Help build a descriptive message in \c error if there are
   * \c errorReasons why PME on GPU is not supported.
@@@ -133,33 -133,48 +133,48 @@@ static boo
  addMessageIfNotSupported(const std::list<std::string> &errorReasons,
                           std::string                  *error)
  {
-     bool foundErrorReasons = errorReasons.empty();
-     if (!foundErrorReasons && error)
+     bool isSupported = errorReasons.empty();
+     if (!isSupported && error)
      {
          std::string regressionTestMarker = "PME GPU does not support";
          // this prefix is tested for in the regression tests script gmxtest.pl
-         *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+         *error = regressionTestMarker;
+         if (errorReasons.size() == 1)
+         {
+             *error += " " + errorReasons.back();
+         }
+         else
+         {
+             *error += ": " + gmx::joinStrings(errorReasons, "; ");
+         }
+         *error += ".";
      }
-     return foundErrorReasons;
+     return isSupported;
  }
  
- bool pme_gpu_supports_build(const gmx_hw_info_t &hwinfo,
-                             std::string         *error)
+ bool pme_gpu_supports_build(std::string *error)
  {
      std::list<std::string> errorReasons;
      if (GMX_DOUBLE)
      {
-         errorReasons.emplace_back("double precision");
+         errorReasons.emplace_back("a double-precision build");
      }
      if (GMX_GPU == GMX_GPU_NONE)
      {
-         errorReasons.emplace_back("non-GPU build of GROMACS");
+         errorReasons.emplace_back("a non-GPU build");
      }
+     return addMessageIfNotSupported(errorReasons, error);
+ }
+ bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
+                                std::string         *error)
+ {
+     std::list<std::string> errorReasons;
      if (GMX_GPU == GMX_GPU_OPENCL)
      {
          if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
          {
-             errorReasons.emplace_back("only AMD devices are supported");
+             errorReasons.emplace_back("non-AMD devices");
          }
      }
      return addMessageIfNotSupported(errorReasons, error);
index c3fe624c5ca9904fcca3964f0f9bc4ef8d4a7dd8,7475de602e6d2ea4d4373274321b00b353bb35ad..ad8905d0fdd27587f95514e1f21e07489f1e5adf
  #include <algorithm>
  
  #include "gromacs/domdec/domdec.h"
 -#include "gromacs/ewald/pme-gather.h"
 -#include "gromacs/ewald/pme-gpu-internal.h"
 -#include "gromacs/ewald/pme-grid.h"
 -#include "gromacs/ewald/pme-internal.h"
 -#include "gromacs/ewald/pme-solve.h"
 -#include "gromacs/ewald/pme-spread.h"
 +#include "gromacs/ewald/pme_gather.h"
 +#include "gromacs/ewald/pme_gpu_internal.h"
 +#include "gromacs/ewald/pme_grid.h"
 +#include "gromacs/ewald/pme_internal.h"
 +#include "gromacs/ewald/pme_solve.h"
 +#include "gromacs/ewald/pme_spread.h"
  #include "gromacs/fft/parallel_3dfft.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/invertmatrix.h"
@@@ -85,7 -85,8 +85,8 @@@ bool pmeSupportsInputForMode(const gmx_
              break;
  
          case CodePath::GPU:
-             implemented = (pme_gpu_supports_build(hwinfo, nullptr) &&
+             implemented = (pme_gpu_supports_build(nullptr) &&
+                            pme_gpu_supports_hardware(hwinfo, nullptr) &&
                             pme_gpu_supports_input(*inputRec, mtop, nullptr));
              break;
  
@@@ -178,7 -179,7 +179,7 @@@ PmeSafePointer pmeInitAtoms(const t_inp
                              )
  {
      const index     atomCount = coordinates.size();
 -    GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
 +    GMX_RELEASE_ASSERT(atomCount == charges.ssize(), "Mismatch in atom data");
      PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
      pme_atomcomm_t *atc     = nullptr;
  
@@@ -381,7 -382,7 +382,7 @@@ void pmePerformGather(gmx_pme_t *pme, C
  {
      pme_atomcomm_t *atc                     = &(pme->atc[0]);
      const index     atomCount               = atc->n;
 -    GMX_RELEASE_ASSERT(forces.size() == atomCount, "Invalid force buffer size");
 +    GMX_RELEASE_ASSERT(forces.ssize() == atomCount, "Invalid force buffer size");
      const bool      forceReductionWithInput = (inputTreatment == PmeForceOutputHandling::ReduceWithInput);
      const real      scale                   = 1.0;
      const size_t    threadIndex             = 0;
@@@ -447,7 -448,7 +448,7 @@@ void pmeSetSplineData(const gmx_pme_t *
      const index           atomCount   = atc->n;
      const index           pmeOrder    = pme->pme_order;
      const index           dimSize     = pmeOrder * atomCount;
 -    GMX_RELEASE_ASSERT(dimSize == splineValues.size(), "Mismatch in spline data");
 +    GMX_RELEASE_ASSERT(dimSize == splineValues.ssize(), "Mismatch in spline data");
      real                 *splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
  
      switch (mode)
@@@ -472,7 -473,7 +473,7 @@@ void pmeSetGridLineIndices(const gmx_pm
  {
      const pme_atomcomm_t       *atc         = &(pme->atc[0]);
      const index                 atomCount   = atc->n;
 -    GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
 +    GMX_RELEASE_ASSERT(atomCount == gridLineIndices.ssize(), "Mismatch in gridline indices size");
  
      IVec paddedGridSizeUnused, gridSize(0, 0, 0);
      pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
index 6157cd3dd1bc251fe7a692e58c5800ba70865aab,5cbbcc311951ab6a4620288092dbdabd36c00943..e15c37c3a1f1c2eae738e2b2f7fd84c77d6e1682
@@@ -44,8 -44,7 +44,8 @@@
  
  #include "testhardwarecontexts.h"
  
 -#include "gromacs/compat/make_unique.h"
 +#include <memory>
 +
  #include "gromacs/ewald/pme.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/detecthardware.h"
@@@ -110,10 -109,11 +110,11 @@@ static gmx_hw_info_t *hardwareInit(
  
  void PmeTestEnvironment::SetUp()
  {
 -    hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
 +    hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
  
      hardwareInfo_ = hardwareInit();
-     if (!pme_gpu_supports_build(*hardwareInfo_, nullptr))
+     if (!pme_gpu_supports_build(nullptr) ||
+         !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
      {
          // PME can only run on the CPU, so don't make any more test contexts.
          return;
          char        stmp[200] = {};
          get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
          std::string description = "(GPU " + std::string(stmp) + ") ";
 -        hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>
 +        hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>
                                             (CodePath::GPU, description.c_str(),
                                             deviceInfo));
      }
index 572c385516faf0094710afd2bfaeb227ca9dfcde,0000000000000000000000000000000000000000..8f7df5bbbce7ffc533f485cd36be65b0f8e33dde
mode 100644,000000..100644
--- /dev/null
@@@ -1,616 -1,0 +1,621 @@@
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#include "gmxpre.h"
 +
 +#include "insert_molecules.h"
 +
 +#include <algorithm>
 +#include <memory>
 +#include <set>
 +#include <string>
 +#include <vector>
 +
 +#include "gromacs/commandline/cmdlineoptionsmodule.h"
 +#include "gromacs/fileio/confio.h"
 +#include "gromacs/fileio/filetypes.h"
 +#include "gromacs/fileio/xvgr.h"
 +#include "gromacs/gmxlib/conformation_utilities.h"
 +#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
 +#include "gromacs/math/functions.h"
 +#include "gromacs/math/utilities.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/options/basicoptions.h"
 +#include "gromacs/options/filenameoption.h"
 +#include "gromacs/options/ioptionscontainer.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/random/threefry.h"
 +#include "gromacs/random/uniformrealdistribution.h"
 +#include "gromacs/selection/nbsearch.h"
 +#include "gromacs/selection/selection.h"
 +#include "gromacs/selection/selectioncollection.h"
 +#include "gromacs/selection/selectionoption.h"
 +#include "gromacs/selection/selectionoptionbehavior.h"
 +#include "gromacs/topology/atomprop.h"
 +#include "gromacs/topology/atoms.h"
 +#include "gromacs/topology/atomsbuilder.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/topology/symtab.h"
 +#include "gromacs/topology/topology.h"
 +#include "gromacs/trajectory/trajectoryframe.h"
- #include "gromacs/utility/unique_cptr.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/exceptions.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
-         gmx_mtop_t *getTopology(bool /*required*/) override { return topInfo_.mtop(); }
 +
 +using gmx::RVec;
 +
 +/* enum for random rotations of inserted solutes */
 +enum RotationType {
 +    en_rotXYZ, en_rotZ, en_rotNone
 +};
 +const char *const cRotationEnum[] = {"xyz", "z", "none"};
 +
 +static void center_molecule(gmx::ArrayRef<RVec> x)
 +{
 +    rvec center = {0};
 +    for (auto &xi : x)
 +    {
 +        rvec_inc(center, xi);
 +    }
 +    svmul(1.0/x.size(), center, center);
 +    for (auto &xi : x)
 +    {
 +        rvec_dec(xi, center);
 +    }
 +}
 +
 +static void generate_trial_conf(gmx::ArrayRef<RVec> xin,
 +                                const rvec offset, RotationType enum_rot,
 +                                gmx::DefaultRandomEngine * rng,
 +                                std::vector<RVec> *xout)
 +{
 +    gmx::UniformRealDistribution<real> dist(0, 2.0*M_PI);
 +    xout->assign(xin.begin(), xin.end());
 +
 +    real alfa = 0.0, beta = 0.0, gamma = 0.0;
 +    switch (enum_rot)
 +    {
 +        case en_rotXYZ:
 +            alfa  = dist(*rng);
 +            beta  = dist(*rng);
 +            gamma = dist(*rng);
 +            break;
 +        case en_rotZ:
 +            alfa  = beta = 0.;
 +            gamma = dist(*rng);
 +            break;
 +        case en_rotNone:
 +            alfa = beta = gamma = 0.;
 +            break;
 +    }
 +    if (enum_rot == en_rotXYZ || enum_rot == en_rotZ)
 +    {
 +        rotate_conf(xout->size(), as_rvec_array(xout->data()), nullptr, alfa, beta, gamma);
 +    }
 +    for (size_t i = 0; i < xout->size(); ++i)
 +    {
 +        rvec_inc((*xout)[i], offset);
 +    }
 +}
 +
 +static bool isInsertionAllowed(gmx::AnalysisNeighborhoodSearch *search,
 +                               const std::vector<real>         &exclusionDistances,
 +                               const std::vector<RVec>         &x,
 +                               const std::vector<real>         &exclusionDistances_insrt,
 +                               const t_atoms                   &atoms,
 +                               const std::set<int>             &removableAtoms,
 +                               gmx::AtomsRemover               *remover)
 +{
 +    gmx::AnalysisNeighborhoodPositions  pos(x);
 +    gmx::AnalysisNeighborhoodPairSearch pairSearch = search->startPairSearch(pos);
 +    gmx::AnalysisNeighborhoodPair       pair;
 +    while (pairSearch.findNextPair(&pair))
 +    {
 +        const real r1 = exclusionDistances[pair.refIndex()];
 +        const real r2 = exclusionDistances_insrt[pair.testIndex()];
 +        if (pair.distance2() < gmx::square(r1 + r2))
 +        {
 +            if (removableAtoms.count(pair.refIndex()) == 0)
 +            {
 +                return false;
 +            }
 +            // TODO: If molecule information is available, this should ideally
 +            // use it to remove whole molecules.
 +            remover->markResidue(atoms, pair.refIndex(), true);
 +        }
 +    }
 +    return true;
 +}
 +
 +static void insert_mols(int nmol_insrt, int ntry, int seed,
 +                        real defaultDistance, real scaleFactor,
 +                        t_atoms *atoms, t_symtab *symtab, std::vector<RVec> *x,
 +                        const std::set<int> &removableAtoms,
 +                        const t_atoms &atoms_insrt, gmx::ArrayRef<RVec> x_insrt,
 +                        int ePBC, matrix box,
 +                        const std::string &posfn, const rvec deltaR,
 +                        RotationType enum_rot)
 +{
 +    fprintf(stderr, "Initialising inter-atomic distances...\n");
 +    AtomProperties          aps;
 +    std::vector<real>       exclusionDistances(
 +            makeExclusionDistances(atoms, &aps, defaultDistance, scaleFactor));
 +    const std::vector<real> exclusionDistances_insrt(
 +            makeExclusionDistances(&atoms_insrt, &aps, defaultDistance, scaleFactor));
 +
 +    const real       maxInsertRadius
 +        = *std::max_element(exclusionDistances_insrt.begin(),
 +                            exclusionDistances_insrt.end());
 +    real             maxRadius = maxInsertRadius;
 +    if (!exclusionDistances.empty())
 +    {
 +        const real maxExistingRadius
 +            = *std::max_element(exclusionDistances.begin(),
 +                                exclusionDistances.end());
 +        maxRadius = std::max(maxInsertRadius, maxExistingRadius);
 +    }
 +
 +    // TODO: Make all of this exception-safe.
 +    gmx::AnalysisNeighborhood nb;
 +    nb.setCutoff(maxInsertRadius + maxRadius);
 +
 +
 +    if (seed == 0)
 +    {
 +        seed = static_cast<int>(gmx::makeRandomSeed());
 +    }
 +    fprintf(stderr, "Using random seed %d\n", seed);
 +
 +    gmx::DefaultRandomEngine rng(seed);
 +
 +    t_pbc                    pbc;
 +    set_pbc(&pbc, ePBC, box);
 +
 +    /* With -ip, take nmol_insrt from file posfn */
 +    double     **rpos              = nullptr;
 +    const bool   insertAtPositions = !posfn.empty();
 +    if (insertAtPositions)
 +    {
 +        int ncol;
 +        nmol_insrt = read_xvg(posfn.c_str(), &rpos, &ncol);
 +        if (ncol != 3)
 +        {
 +            gmx_fatal(FARGS, "Expected 3 columns (x/y/z coordinates) in file %s\n",
 +                      posfn.c_str());
 +        }
 +        fprintf(stderr, "Read %d positions from file %s\n\n",
 +                nmol_insrt, posfn.c_str());
 +    }
 +
 +    gmx::AtomsBuilder builder(atoms, symtab);
 +    gmx::AtomsRemover remover(*atoms);
 +    {
 +        const int finalAtomCount    = atoms->nr + nmol_insrt * atoms_insrt.nr;
 +        const int finalResidueCount = atoms->nres + nmol_insrt * atoms_insrt.nres;
 +        builder.reserve(finalAtomCount, finalResidueCount);
 +        x->reserve(finalAtomCount);
 +        exclusionDistances.reserve(finalAtomCount);
 +    }
 +
 +    std::vector<RVec>                    x_n(x_insrt.size());
 +
 +    int                                  mol        = 0;
 +    int                                  trial      = 0;
 +    int                                  firstTrial = 0;
 +    int                                  failed     = 0;
 +    gmx::UniformRealDistribution<real>   dist;
 +
 +    while (mol < nmol_insrt && trial < ntry*nmol_insrt)
 +    {
 +        rvec offset_x;
 +        if (!insertAtPositions)
 +        {
 +            // Insert at random positions.
 +            offset_x[XX] = box[XX][XX] * dist(rng);
 +            offset_x[YY] = box[YY][YY] * dist(rng);
 +            offset_x[ZZ] = box[ZZ][ZZ] * dist(rng);
 +        }
 +        else
 +        {
 +            // Skip a position if ntry trials were not successful.
 +            if (trial >= firstTrial + ntry)
 +            {
 +                fprintf(stderr, " skipped position (%.3f, %.3f, %.3f)\n",
 +                        rpos[XX][mol], rpos[YY][mol], rpos[ZZ][mol]);
 +                ++mol;
 +                ++failed;
 +                firstTrial = trial;
 +                continue;
 +            }
 +            // Insert at positions taken from option -ip file.
 +            offset_x[XX] = rpos[XX][mol] + deltaR[XX]*(2 * dist(rng)-1);
 +            offset_x[YY] = rpos[YY][mol] + deltaR[YY]*(2 * dist(rng)-1);
 +            offset_x[ZZ] = rpos[ZZ][mol] + deltaR[ZZ]*(2 * dist(rng)-1);
 +        }
 +        fprintf(stderr, "\rTry %d", ++trial);
 +        fflush(stderr);
 +
 +        generate_trial_conf(x_insrt, offset_x, enum_rot, &rng, &x_n);
 +        gmx::AnalysisNeighborhoodPositions pos(*x);
 +        gmx::AnalysisNeighborhoodSearch    search = nb.initSearch(&pbc, pos);
 +        if (isInsertionAllowed(&search, exclusionDistances, x_n, exclusionDistances_insrt,
 +                               *atoms, removableAtoms, &remover))
 +        {
 +            x->insert(x->end(), x_n.begin(), x_n.end());
 +            exclusionDistances.insert(exclusionDistances.end(),
 +                                      exclusionDistances_insrt.begin(),
 +                                      exclusionDistances_insrt.end());
 +            builder.mergeAtoms(atoms_insrt);
 +            ++mol;
 +            firstTrial = trial;
 +            fprintf(stderr, " success (now %d atoms)!\n", builder.currentAtomCount());
 +        }
 +    }
 +
 +    fprintf(stderr, "\n");
 +    /* print number of molecules added */
 +    fprintf(stderr, "Added %d molecules (out of %d requested)\n",
 +            mol - failed, nmol_insrt);
 +
 +    const int originalAtomCount    = atoms->nr;
 +    const int originalResidueCount = atoms->nres;
 +    remover.refreshAtomCount(*atoms);
 +    remover.removeMarkedElements(x);
 +    remover.removeMarkedAtoms(atoms);
 +    if (atoms->nr < originalAtomCount)
 +    {
 +        fprintf(stderr, "Replaced %d residues (%d atoms)\n",
 +                originalResidueCount - atoms->nres,
 +                originalAtomCount - atoms->nr);
 +    }
 +
 +    if (rpos != nullptr)
 +    {
 +        for (int i = 0; i < DIM; ++i)
 +        {
 +            sfree(rpos[i]);
 +        }
 +        sfree(rpos);
 +    }
 +}
 +
 +namespace gmx
 +{
 +
 +namespace
 +{
 +
 +class InsertMolecules : public ICommandLineOptionsModule, public ITopologyProvider
 +{
 +    public:
 +        InsertMolecules()
 +            : bBox_(false), nmolIns_(0), nmolTry_(10), seed_(0),
 +              defaultDistance_(0.105), scaleFactor_(0.57), enumRot_(en_rotXYZ)
 +        {
 +            clear_rvec(newBox_);
 +            clear_rvec(deltaR_);
++            clear_mat(box_);
 +        }
 +
 +        // From ITopologyProvider
-         TopologyInformation topInfo_;
++        gmx_mtop_t *getTopology(bool /*required*/) override { return &top_; }
 +        int getAtomCount() override { return 0; }
 +
 +        // From ICommandLineOptionsModule
 +        void init(CommandLineModuleSettings * /*settings*/) override
 +        {
 +        }
 +        void initOptions(IOptionsContainer                 *options,
 +                         ICommandLineOptionsModuleSettings *settings) override;
 +        void optionsFinished() override;
 +        int run() override;
 +
 +    private:
 +        SelectionCollection selections_;
 +
 +        std::string         inputConfFile_;
 +        std::string         insertConfFile_;
 +        std::string         positionFile_;
 +        std::string         outputConfFile_;
 +        rvec                newBox_;
 +        bool                bBox_;
 +        int                 nmolIns_;
 +        int                 nmolTry_;
 +        int                 seed_;
 +        real                defaultDistance_;
 +        real                scaleFactor_;
 +        rvec                deltaR_;
 +        RotationType        enumRot_;
 +        Selection           replaceSel_;
 +
-         topInfo_.fillFromInputFile(inputConfFile_);
-         if (topInfo_.mtop()->natoms == 0)
++        gmx_mtop_t          top_;
++        std::vector<RVec>   x_;
++        matrix              box_;
++        int                 ePBC_;
 +};
 +
 +void InsertMolecules::initOptions(IOptionsContainer                 *options,
 +                                  ICommandLineOptionsModuleSettings *settings)
 +{
 +    const char *const desc[] = {
 +        "[THISMODULE] inserts [TT]-nmol[tt] copies of the system specified in",
 +        "the [TT]-ci[tt] input file. The insertions take place either into",
 +        "vacant space in the solute conformation given with [TT]-f[tt], or",
 +        "into an empty box given by [TT]-box[tt]. Specifying both [TT]-f[tt]",
 +        "and [TT]-box[tt] behaves like [TT]-f[tt], but places a new box",
 +        "around the solute before insertions. Any velocities present are",
 +        "discarded.",
 +        "",
 +        "It is possible to also insert into a solvated configuration and",
 +        "replace solvent atoms with the inserted atoms. To do this, use",
 +        "[TT]-replace[tt] to specify a selection that identifies the atoms",
 +        "that can be replaced. The tool assumes that all molecules in this",
 +        "selection consist of single residues: each residue from this",
 +        "selection that overlaps with the inserted molecules will be removed",
 +        "instead of preventing insertion.",
 +        "",
 +        "By default, the insertion positions are random (with initial seed",
 +        "specified by [TT]-seed[tt]). The program iterates until [TT]-nmol[tt]",
 +        "molecules have been inserted in the box. Molecules are not inserted",
 +        "where the distance between any existing atom and any atom of the",
 +        "inserted molecule is less than the sum based on the van der Waals",
 +        "radii of both atoms. A database ([TT]vdwradii.dat[tt]) of van der",
 +        "Waals radii is read by the program, and the resulting radii scaled",
 +        "by [TT]-scale[tt]. If radii are not found in the database, those",
 +        "atoms are assigned the (pre-scaled) distance [TT]-radius[tt].",
 +        "Note that the usefulness of those radii depends on the atom names,",
 +        "and thus varies widely with force field.",
 +        "",
 +        "A total of [TT]-nmol[tt] * [TT]-try[tt] insertion attempts are made",
 +        "before giving up. Increase [TT]-try[tt] if you have several small",
 +        "holes to fill. Option [TT]-rot[tt] specifies whether the insertion",
 +        "molecules are randomly oriented before insertion attempts.",
 +        "",
 +        "Alternatively, the molecules can be inserted only at positions defined in",
 +        "positions.dat ([TT]-ip[tt]). That file should have 3 columns (x,y,z),",
 +        "that give the displacements compared to the input molecule position",
 +        "([TT]-ci[tt]). Hence, if that file should contain the absolute",
 +        "positions, the molecule must be centered on (0,0,0) before using",
 +        "[THISMODULE] (e.g. from [gmx-editconf] [TT]-center[tt]).",
 +        "Comments in that file starting with # are ignored. Option [TT]-dr[tt]",
 +        "defines the maximally allowed displacements during insertial trials.",
 +        "[TT]-try[tt] and [TT]-rot[tt] work as in the default mode (see above)."
 +    };
 +
 +    settings->setHelpText(desc);
 +
 +    std::shared_ptr<SelectionOptionBehavior> selectionOptionBehavior(
 +            new SelectionOptionBehavior(&selections_, this));
 +    settings->addOptionsBehavior(selectionOptionBehavior);
 +
 +    // TODO: Replace use of legacyType.
 +    options->addOption(FileNameOption("f")
 +                           .legacyType(efSTX).inputFile()
 +                           .store(&inputConfFile_)
 +                           .defaultBasename("protein")
 +                           .description("Existing configuration to insert into"));
 +    options->addOption(FileNameOption("ci")
 +                           .legacyType(efSTX).inputFile().required()
 +                           .store(&insertConfFile_)
 +                           .defaultBasename("insert")
 +                           .description("Configuration to insert"));
 +    options->addOption(FileNameOption("ip")
 +                           .filetype(eftGenericData).inputFile()
 +                           .store(&positionFile_)
 +                           .defaultBasename("positions")
 +                           .description("Predefined insertion trial positions"));
 +    options->addOption(FileNameOption("o")
 +                           .legacyType(efSTO).outputFile().required()
 +                           .store(&outputConfFile_)
 +                           .defaultBasename("out")
 +                           .description("Output configuration after insertion"));
 +
 +    options->addOption(SelectionOption("replace").onlyAtoms()
 +                           .store(&replaceSel_)
 +                           .description("Atoms that can be removed if overlapping"));
 +    selectionOptionBehavior->initOptions(options);
 +
 +    options->addOption(RealOption("box").vector()
 +                           .store(newBox_).storeIsSet(&bBox_)
 +                           .description("Box size (in nm)"));
 +    options->addOption(IntegerOption("nmol")
 +                           .store(&nmolIns_)
 +                           .description("Number of extra molecules to insert"));
 +    options->addOption(IntegerOption("try")
 +                           .store(&nmolTry_)
 +                           .description("Try inserting [TT]-nmol[tt] times [TT]-try[tt] times"));
 +    options->addOption(IntegerOption("seed")
 +                           .store(&seed_)
 +                           .description("Random generator seed (0 means generate)"));
 +    options->addOption(RealOption("radius")
 +                           .store(&defaultDistance_)
 +                           .description("Default van der Waals distance"));
 +    options->addOption(RealOption("scale")
 +                           .store(&scaleFactor_)
 +                           .description("Scale factor to multiply Van der Waals radii from the database in share/gromacs/top/vdwradii.dat. The default value of 0.57 yields density close to 1000 g/l for proteins in water."));
 +    options->addOption(RealOption("dr").vector()
 +                           .store(deltaR_)
 +                           .description("Allowed displacement in x/y/z from positions in [TT]-ip[tt] file"));
 +    options->addOption(EnumOption<RotationType>("rot").enumValue(cRotationEnum)
 +                           .store(&enumRot_)
 +                           .description("Rotate inserted molecules randomly"));
 +}
 +
 +void InsertMolecules::optionsFinished()
 +{
 +    if (nmolIns_ <= 0 && positionFile_.empty())
 +    {
 +        GMX_THROW(InconsistentInputError("Either -nmol must be larger than 0, "
 +                                         "or positions must be given with -ip."));
 +    }
 +    if (inputConfFile_.empty() && !bBox_)
 +    {
 +        GMX_THROW(InconsistentInputError("When no solute (-f) is specified, "
 +                                         "a box size (-box) must be specified."));
 +    }
 +    if (replaceSel_.isValid() && inputConfFile_.empty())
 +    {
 +        GMX_THROW(InconsistentInputError("Replacement (-replace) only makes sense "
 +                                         "together with an existing configuration (-f)."));
 +    }
 +
 +    if (!inputConfFile_.empty())
 +    {
++        bool  bTprFileWasRead;
++        rvec *temporaryX = nullptr;
 +        fprintf(stderr, "Reading solute configuration\n");
-     const char       *outputTitle = topInfo_.name();
-     std::vector<RVec> xOutput;
-     matrix            box = {{ 0 }};
-     if (topInfo_.hasTopology())
-     {
-         xOutput = copyOf(topInfo_.x());
-         topInfo_.getBox(box);
-     }
-     else
-     {
-         xOutput.resize(topInfo_.mtop()->natoms);
-     }
-     auto          atomsSolute = topInfo_.copyAtoms();
++        readConfAndTopology(inputConfFile_.c_str(), &bTprFileWasRead, &top_,
++                            &ePBC_, &temporaryX, nullptr, box_);
++        x_.assign(temporaryX, temporaryX + top_.natoms);
++        sfree(temporaryX);
++        if (top_.natoms == 0)
 +        {
 +            fprintf(stderr, "Note: no atoms in %s\n", inputConfFile_.c_str());
 +        }
 +    }
 +}
 +
 +int InsertMolecules::run()
 +{
-         set_pbc(&pbc, topInfo_.ePBC(), box);
 +    std::set<int> removableAtoms;
 +    if (replaceSel_.isValid())
 +    {
 +        t_pbc       pbc;
-         fr->natoms = topInfo_.mtop()->natoms;
++        set_pbc(&pbc, ePBC_, box_);
 +        t_trxframe *fr;
 +        snew(fr, 1);
-         fr->x      = as_rvec_array(xOutput.data());
++        fr->natoms = x_.size();
 +        fr->bX     = TRUE;
-     int ePBCForOutput = topInfo_.ePBC();
++        fr->x      = as_rvec_array(x_.data());
 +        selections_.evaluate(fr, &pbc);
 +        sfree(fr);
 +        removableAtoms.insert(replaceSel_.atomIndices().begin(),
 +                              replaceSel_.atomIndices().end());
 +        // TODO: It could be nice to check that removableAtoms contains full
 +        // residues, since we anyways remove whole residues instead of
 +        // individual atoms.
 +    }
 +
-         clear_mat(box);
-         box[XX][XX] = newBox_[XX];
-         box[YY][YY] = newBox_[YY];
-         box[ZZ][ZZ] = newBox_[ZZ];
++    int ePBCForOutput = ePBC_;
 +    if (bBox_)
 +    {
 +        ePBCForOutput = epbcXYZ;
-     if (det(box) == 0)
++        clear_mat(box_);
++        box_[XX][XX] = newBox_[XX];
++        box_[YY][YY] = newBox_[YY];
++        box_[ZZ][ZZ] = newBox_[ZZ];
 +    }
-     fprintf(stderr, "Reading molecule configuration\n");
-     TopologyInformation      topInfoForInsertedMolecule;
-     topInfoForInsertedMolecule.fillFromInputFile(insertConfFile_);
-     auto                     atomsInserted = topInfoForInsertedMolecule.atoms();
-     std::vector<RVec>        xInserted     = copyOf(topInfoForInsertedMolecule.x());
-     if (topInfoForInsertedMolecule.mtop()->natoms == 0)
-     {
-         gmx_fatal(FARGS, "No molecule in %s, please check your input",
-                   insertConfFile_.c_str());
-     }
-     if (outputTitle == nullptr)
-     {
-         outputTitle = topInfoForInsertedMolecule.name();
-     }
-     if (positionFile_.empty())
++    if (det(box_) == 0)
 +    {
 +        gmx_fatal(FARGS, "Undefined solute box.\nCreate one with gmx editconf "
 +                  "or give explicit -box command line option");
 +    }
 +
-         center_molecule(xInserted);
++    gmx_mtop_t         topInserted;
++    t_atoms            atomsInserted;
++    std::vector<RVec>  xInserted;
 +    {
-     auto              symtabInserted = duplicateSymtab(&topInfo_.mtop()->symtab);
-     const sfree_guard symtabInsertedGuard(symtabInserted);
++        bool        bTprFileWasRead;
++        int         ePBC_dummy;
++        matrix      box_dummy;
++        rvec       *temporaryX;
++        readConfAndTopology(insertConfFile_.c_str(), &bTprFileWasRead, &topInserted,
++                            &ePBC_dummy, &temporaryX, nullptr, box_dummy);
++        xInserted.assign(temporaryX, temporaryX + topInserted.natoms);
++        sfree(temporaryX);
++        atomsInserted = gmx_mtop_global_atoms(&topInserted);
++        if (atomsInserted.nr == 0)
++        {
++            gmx_fatal(FARGS, "No molecule in %s, please check your input",
++                      insertConfFile_.c_str());
++        }
++        if (top_.name == nullptr)
++        {
++            top_.name = topInserted.name;
++        }
++        if (positionFile_.empty())
++        {
++            center_molecule(xInserted);
++        }
 +    }
 +
-                 atomsSolute.get(), symtabInserted, &xOutput, removableAtoms, *atomsInserted, xInserted,
-                 ePBCForOutput, box, positionFile_, deltaR_, enumRot_);
++    t_atoms atoms = gmx_mtop_global_atoms(&top_);
++
 +    /* add nmol_ins molecules of atoms_ins
 +       in random orientation at random place */
 +    insert_mols(nmolIns_, nmolTry_, seed_, defaultDistance_, scaleFactor_,
-     write_sto_conf(outputConfFile_.c_str(), outputTitle, atomsSolute.get(),
-                    as_rvec_array(xOutput.data()), nullptr, ePBCForOutput, box);
++                &atoms, &top_.symtab, &x_, removableAtoms, atomsInserted, xInserted,
++                ePBCForOutput, box_, positionFile_, deltaR_, enumRot_);
 +
 +    /* write new configuration to file confout */
 +    fprintf(stderr, "Writing generated configuration to %s\n",
 +            outputConfFile_.c_str());
-             atomsSolute->nr, atomsSolute->nres);
-     done_symtab(symtabInserted);
++    write_sto_conf(outputConfFile_.c_str(), *top_.name, &atoms,
++                   as_rvec_array(x_.data()), nullptr, ePBCForOutput, box_);
 +
 +    /* print size of generated configuration */
 +    fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
++            atoms.nr, atoms.nres);
++
++    done_atom(&atoms);
++    done_atom(&atomsInserted);
++
 +    return 0;
 +}
 +
 +}   // namespace
 +
 +
 +const char* InsertMoleculesInfo::name()
 +{
 +    static const char* name = "insert-molecules";
 +    return name;
 +}
 +
 +const char* InsertMoleculesInfo::shortDescription()
 +{
 +    static const char* shortDescription =
 +        "Insert molecules into existing vacancies";
 +    return shortDescription;
 +}
 +
 +ICommandLineOptionsModulePointer InsertMoleculesInfo::create()
 +{
 +    return ICommandLineOptionsModulePointer(new InsertMolecules());
 +}
 +
 +} // namespace gmx
index 2804ab3a1ab74f4762e276a3ddbb0afd2a93daa4,21226f53504247a993991487e3f21bc988401277..f9b1f27e2c8c8cbfab2dc1cb9aa259f2f5002039
@@@ -46,7 -46,7 +46,7 @@@
  #include <algorithm>
  #include <string>
  
 -#include "gromacs/awh/read-params.h"
 +#include "gromacs/awh/read_params.h"
  #include "gromacs/fileio/readinp.h"
  #include "gromacs/fileio/warninp.h"
  #include "gromacs/gmxlib/chargegroup.h"
@@@ -60,7 -60,7 +60,7 @@@
  #include "gromacs/mdrunutility/mdmodules.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
 -#include "gromacs/mdtypes/pull-params.h"
 +#include "gromacs/mdtypes/pull_params.h"
  #include "gromacs/options/options.h"
  #include "gromacs/options/treesupport.h"
  #include "gromacs/pbcutil/pbc.h"
@@@ -1531,12 -1531,6 +1531,6 @@@ static void do_fep_params(t_inputrec *i
          expand->nstexpanded = fep->nstdhdl;
          /* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
      }
-     if ((expand->nstexpanded < 0) && ir->bSimTemp)
-     {
-         expand->nstexpanded = 2*static_cast<int>(ir->opts.tau_t[0]/ir->delta_t);
-         /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
-            2*tau_t just to be careful so it's not to frequent  */
-     }
  }
  
  
@@@ -2646,7 -2640,7 +2640,7 @@@ static bool do_numbering(int natoms, gm
      }
  
      snew(grps->nm_ind, groupsFromMdpFile.size()+1); /* +1 for possible rest group */
 -    for (int i = 0; i != groupsFromMdpFile.size(); ++i)
 +    for (int i = 0; i != groupsFromMdpFile.ssize(); ++i)
      {
          /* Lookup the group name in the block structure */
          gid = search_string(groupsFromMdpFile[i].c_str(), block->nr, gnames);
  static void calc_nrdf(const gmx_mtop_t *mtop, t_inputrec *ir, char **gnames)
  {
      t_grpopts              *opts;
 -    const gmx_groups_t     *groups;
      pull_params_t          *pull;
      int                     natoms, ai, aj, i, j, d, g, imin, jmin;
      int                    *nrdf2, *na_vcm, na_tot;
      double                 *nrdf_tc, *nrdf_vcm, nrdf_uc, *nrdf_vcm_sub;
      ivec                   *dof_vcm;
 -    gmx_mtop_atomloop_all_t aloop;
      int                     mol, ftype, as;
  
      /* Calculate nrdf.
  
      opts = &ir->opts;
  
 -    groups = &mtop->groups;
 +    const gmx_groups_t &groups = mtop->groups;
      natoms = mtop->natoms;
  
      /* Allocate one more for a possible rest group */
      /* We need to sum degrees of freedom into doubles,
       * since floats give too low nrdf's above 3 million atoms.
       */
 -    snew(nrdf_tc, groups->grps[egcTC].nr+1);
 -    snew(nrdf_vcm, groups->grps[egcVCM].nr+1);
 -    snew(dof_vcm, groups->grps[egcVCM].nr+1);
 -    snew(na_vcm, groups->grps[egcVCM].nr+1);
 -    snew(nrdf_vcm_sub, groups->grps[egcVCM].nr+1);
 +    snew(nrdf_tc, groups.grps[egcTC].nr+1);
 +    snew(nrdf_vcm, groups.grps[egcVCM].nr+1);
 +    snew(dof_vcm, groups.grps[egcVCM].nr+1);
 +    snew(na_vcm, groups.grps[egcVCM].nr+1);
 +    snew(nrdf_vcm_sub, groups.grps[egcVCM].nr+1);
  
 -    for (i = 0; i < groups->grps[egcTC].nr; i++)
 +    for (i = 0; i < groups.grps[egcTC].nr; i++)
      {
          nrdf_tc[i] = 0;
      }
 -    for (i = 0; i < groups->grps[egcVCM].nr+1; i++)
 +    for (i = 0; i < groups.grps[egcVCM].nr+1; i++)
      {
          nrdf_vcm[i]     = 0;
          clear_ivec(dof_vcm[i]);
      }
  
      snew(nrdf2, natoms);
 -    aloop = gmx_mtop_atomloop_all_init(mtop);
 -    const t_atom *atom;
 -    while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +    for (const AtomProxy atomP : AtomRange(*mtop))
      {
 +        const t_atom &local = atomP.atom();
 +        int           i     = atomP.globalAtomNumber();
          nrdf2[i] = 0;
 -        if (atom->ptype == eptAtom || atom->ptype == eptNucleus)
 +        if (local.ptype == eptAtom || local.ptype == eptNucleus)
          {
              g = getGroupType(groups, egcFREEZE, i);
 -            for (d = 0; d < DIM; d++)
 +            for (int d = 0; d < DIM; d++)
              {
                  if (opts->nFreeze[g][d] == 0)
                  {
      for (const gmx_molblock_t &molb : mtop->molblock)
      {
          const gmx_moltype_t &molt = mtop->moltype[molb.type];
 -        atom = molt.atoms.atom;
 +        const t_atom        *atom = molt.atoms.atom;
          for (mol = 0; mol < molb.nmol; mol++)
          {
              for (ftype = F_CONSTR; ftype <= F_CONSTRNC; ftype++)
                      nrdf_vcm[getGroupType(groups, egcVCM, ai)] -= 0.5*imin;
                      if (nrdf_tc[getGroupType(groups, egcTC, ai)] < 0)
                      {
 -                        gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups->grps[egcTC].nm_ind[getGroupType(groups, egcTC, ai)]]);
 +                        gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups.grps[egcTC].nm_ind[getGroupType(groups, egcTC, ai)]]);
                      }
                  }
                  else
           * the number of degrees of freedom in each vcm group when COM
           * translation is removed and 6 when rotation is removed as well.
           */
 -        for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +        for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
          {
              switch (ir->comm_mode)
              {
              }
          }
  
 -        for (i = 0; i < groups->grps[egcTC].nr; i++)
 +        for (i = 0; i < groups.grps[egcTC].nr; i++)
          {
              /* Count the number of atoms of TC group i for every VCM group */
 -            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
              {
                  na_vcm[j] = 0;
              }
               */
              nrdf_uc    = nrdf_tc[i];
              nrdf_tc[i] = 0;
 -            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
              {
                  if (nrdf_vcm[j] > nrdf_vcm_sub[j])
                  {
              }
          }
      }
 -    for (i = 0; (i < groups->grps[egcTC].nr); i++)
 +    for (i = 0; (i < groups.grps[egcTC].nr); i++)
      {
          opts->nrdf[i] = nrdf_tc[i];
          if (opts->nrdf[i] < 0)
          }
          fprintf(stderr,
                  "Number of degrees of freedom in T-Coupling group %s is %.2f\n",
 -                gnames[groups->grps[egcTC].nm_ind[i]], opts->nrdf[i]);
 +                gnames[groups.grps[egcTC].nm_ind[i]], opts->nrdf[i]);
      }
  
      sfree(nrdf2);
@@@ -3683,6 -3679,17 +3677,17 @@@ void do_index(const char* mdparin, cons
          gmx_fatal(FARGS, "Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
      }
  
+     /* final check before going out of scope if simulated tempering variables
+      * need to be set to default values.
+      */
+     if ((ir->expandedvals->nstexpanded < 0) && ir->bSimTemp)
+     {
+         ir->expandedvals->nstexpanded = 2*static_cast<int>(ir->opts.tau_t[0]/ir->delta_t);
+         warning(wi, gmx::formatString("the value for nstexpanded was not specified for "
+                                       " expanded ensemble simulated tempering. It is set to 2*tau_t (%d) "
+                                       "by default, but it is recommended to set it to an explicit value!",
+                                       ir->expandedvals->nstexpanded));
+     }
      for (i = 0; (i < grps->nr); i++)
      {
          sfree(gnames[i]);
@@@ -3952,6 -3959,7 +3957,6 @@@ void triple_check(const char *mdparin, 
      real                     *mgrp, mt;
      rvec                      acc;
      gmx_mtop_atomloop_block_t aloopb;
 -    gmx_mtop_atomloop_all_t   aloop;
      ivec                      AbsRef;
      char                      warn_buf[STRLEN];
  
      {
          clear_rvec(acc);
          snew(mgrp, sys->groups.grps[egcACC].nr);
 -        aloop = gmx_mtop_atomloop_all_init(sys);
 -        const t_atom *atom;
 -        while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +        for (const AtomProxy atomP : AtomRange(*sys))
          {
 -            mgrp[getGroupType(&sys->groups, egcACC, i)] += atom->m;
 +            const t_atom &local = atomP.atom();
 +            int           i     = atomP.globalAtomNumber();
 +            mgrp[getGroupType(sys->groups, egcACC, i)] += local.m;
          }
          mt = 0.0;
          for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
index 60fa35e97cd941905fa394c0208f3cd9db5ec6b3,c6e6eba38e77f5d2db51b47add28790f379c4b3e..bf44bd395e803a013bc8a71a4191f255e7942e83
@@@ -46,7 -46,7 +46,7 @@@
  #include "gromacs/commandline/pargs.h"
  #include "gromacs/fileio/confio.h"
  #include "gromacs/fileio/pdbio.h"
 -#include "gromacs/gmxlib/conformation-utilities.h"
 +#include "gromacs/gmxlib/conformation_utilities.h"
  #include "gromacs/gmxpreprocess/makeexclusiondistances.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/units.h"
  #include "gromacs/topology/atomprop.h"
  #include "gromacs/topology/atoms.h"
  #include "gromacs/topology/atomsbuilder.h"
+ #include "gromacs/topology/mtop_util.h"
  #include "gromacs/topology/topology.h"
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
  #include "gromacs/utility/arraysize.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/futil.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/smalloc.h"
- #include "gromacs/utility/unique_cptr.h"
  
  using gmx::RVec;
  
@@@ -638,30 -637,30 +637,30 @@@ static void removeExtraSolventMolecules
      remover.removeMarkedAtoms(atoms);
  }
  
- static void add_solv(const char *filename,
-                      const gmx::TopologyInformation &topInfo,
-                      t_atoms *atoms,
+ static void add_solv(const char *filename, t_atoms *atoms,
+                      t_symtab *symtab,
                       std::vector<RVec> *x, std::vector<RVec> *v,
-                      matrix box, AtomProperties *aps,
 -                     int ePBC, matrix box, gmx_atomprop_t aps,
++                     int ePBC, matrix box, AtomProperties *aps,
                       real defaultDistance, real scaleFactor,
                       real rshell, int max_sol)
  {
-     gmx::TopologyInformation topInfoSolvent;
-     topInfoSolvent.fillFromInputFile(gmx::findLibraryFile(filename));
-     auto                     atomsSolvent = topInfoSolvent.copyAtoms();
-     std::vector<RVec>        xSolvent, vSolvent;
-     matrix                   boxSolvent = {{ 0 }};
-     if (topInfoSolvent.hasTopology())
-     {
-         xSolvent = copyOf(topInfoSolvent.x());
-         vSolvent = copyOf(topInfoSolvent.v());
-         topInfoSolvent.getBox(boxSolvent);
-     }
-     else
-     {
-         xSolvent.resize(atomsSolvent->nr);
-         vSolvent.resize(atomsSolvent->nr);
-     }
+     gmx_mtop_t        topSolvent;
+     std::vector<RVec> xSolvent, vSolvent;
+     matrix            boxSolvent = {{ 0 }};
+     int               ePBCSolvent;
+     fprintf(stderr, "Reading solvent configuration\n");
+     bool              bTprFileWasRead;
+     rvec             *temporaryX = nullptr, *temporaryV = nullptr;
+     readConfAndTopology(gmx::findLibraryFile(filename).c_str(), &bTprFileWasRead, &topSolvent,
+                         &ePBCSolvent, &temporaryX, &temporaryV, boxSolvent);
+     t_atoms *atomsSolvent;
+     snew(atomsSolvent, 1);
+     *atomsSolvent = gmx_mtop_global_atoms(&topSolvent);
+     xSolvent.assign(temporaryX, temporaryX + topSolvent.natoms);
+     sfree(temporaryX);
+     vSolvent.assign(temporaryV, temporaryV + topSolvent.natoms);
+     sfree(temporaryV);
      if (gmx::boxIsZero(boxSolvent))
      {
          gmx_fatal(FARGS, "No box information for solvent in %s, please use a properly formatted file\n",
      const std::vector<real> exclusionDistances(
              makeExclusionDistances(atoms, aps, defaultDistance, scaleFactor));
      std::vector<real>       exclusionDistances_solvt(
-             makeExclusionDistances(atomsSolvent.get(), aps, defaultDistance, scaleFactor));
+             makeExclusionDistances(atomsSolvent, aps, defaultDistance, scaleFactor));
  
      /* generate a new solvent configuration */
      fprintf(stderr, "Generating solvent configuration\n");
      t_pbc pbc;
-     set_pbc(&pbc, topInfo.ePBC(), box);
+     set_pbc(&pbc, ePBC, box);
      if (!gmx::boxesAreEqual(boxSolvent, box))
      {
          if (TRICLINIC(boxSolvent))
                        "You can try to pass the same box for -cp and -cs.");
          }
          /* apply pbc for solvent configuration for whole molecules */
-         rm_res_pbc(atomsSolvent.get(), &xSolvent, boxSolvent);
-         replicateSolventBox(atomsSolvent.get(), &xSolvent, &vSolvent, &exclusionDistances_solvt,
+         rm_res_pbc(atomsSolvent, &xSolvent, boxSolvent);
+         replicateSolventBox(atomsSolvent, &xSolvent, &vSolvent, &exclusionDistances_solvt,
                              boxSolvent, box);
-         if (topInfo.ePBC() != epbcNONE)
+         if (ePBC != epbcNONE)
          {
-             removeSolventBoxOverlap(atomsSolvent.get(), &xSolvent, &vSolvent, &exclusionDistances_solvt, pbc);
+             removeSolventBoxOverlap(atomsSolvent, &xSolvent, &vSolvent, &exclusionDistances_solvt, pbc);
          }
      }
      if (atoms->nr > 0)
      {
          if (rshell > 0.0)
          {
-             removeSolventOutsideShell(atomsSolvent.get(), &xSolvent,  &vSolvent,
+             removeSolventOutsideShell(atomsSolvent, &xSolvent,  &vSolvent,
                                        &exclusionDistances_solvt, pbc, *x, rshell);
          }
-         removeSolventOverlappingWithSolute(atomsSolvent.get(), &xSolvent, &vSolvent,
+         removeSolventOverlappingWithSolute(atomsSolvent, &xSolvent, &vSolvent,
                                             &exclusionDistances_solvt, pbc, *x,
                                             exclusionDistances);
      }
      if (max_sol > 0 && atomsSolvent->nres > max_sol)
      {
          const int numberToRemove = atomsSolvent->nres - max_sol;
-         removeExtraSolventMolecules(atomsSolvent.get(), &xSolvent, &vSolvent, numberToRemove);
+         removeExtraSolventMolecules(atomsSolvent, &xSolvent, &vSolvent, numberToRemove);
      }
  
      /* Sort the solvent mixture, not the protein... */
-     t_atoms *newatoms           = nullptr;
-     t_atoms *sortedAtomsSolvent = atomsSolvent.get();
+     t_atoms *newatoms = nullptr;
+     // The sort_molecule function does something creative with the
+     // t_atoms pointers. We need to make sure we neither leak, nor
+     // double-free, so make a shallow pointer that is fine for it to
+     // change.
+     t_atoms *sortedAtomsSolvent = atomsSolvent;
      sort_molecule(&sortedAtomsSolvent, &newatoms, &xSolvent, &vSolvent);
  
      // Merge the two configurations.
          v->insert(v->end(), vSolvent.begin(), vSolvent.end());
      }
      {
-         gmx::AtomsBuilder builder(atoms, &topInfo.mtop()->symtab);
+         gmx::AtomsBuilder builder(atoms, symtab);
          builder.mergeAtoms(*sortedAtomsSolvent);
      }
      fprintf(stderr, "Generated solvent containing %d atoms in %d residues\n",
-             sortedAtomsSolvent->nr, sortedAtomsSolvent->nres);
+             atomsSolvent->nr, atomsSolvent->nres);
  
      if (newatoms)
      {
          done_atom(newatoms);
          sfree(newatoms);
      }
+     if (atomsSolvent)
+     {
+         done_atom(atomsSolvent);
+         sfree(atomsSolvent);
+     }
  }
  
 -static void update_top(t_atoms *atoms, int firstSolventResidueIndex, matrix box, int NFILE, t_filenm fnm[],
 -                       gmx_atomprop_t aps)
 +static void update_top(t_atoms        *atoms,
 +                       int             firstSolventResidueIndex,
 +                       matrix          box,
 +                       int             NFILE,
 +                       t_filenm        fnm[],
 +                       AtomProperties *aps)
  {
      FILE        *fpin, *fpout;
      char         buf[STRLEN*2], buf2[STRLEN], *temp;
      mtot = 0;
      for (i = 0; (i < atoms->nr); i++)
      {
 -        gmx_atomprop_query(aps, epropMass,
 -                           *atoms->resinfo[atoms->atom[i].resind].name,
 -                           *atoms->atomname[i], &mm);
 +        aps->setAtomProperty(epropMass,
 +                             std::string(*atoms->resinfo[atoms->atom[i].resind].name),
 +                             std::string(*atoms->atomname[i]), &mm);
          mtot += mm;
      }
  
@@@ -924,6 -928,7 +932,6 @@@ int gmx_solvate(int argc, char *argv[]
      /* parameter data */
      gmx_bool       bProt, bBox;
      const char    *conf_prot, *confout;
 -    gmx_atomprop_t aps;
  
      t_filenm       fnm[] = {
          { efSTX, "-cp", "protein", ffOPTRD },
                    "a box size (-box) must be specified");
      }
  
 -    aps = gmx_atomprop_init();
 +    AtomProperties           aps;
  
-     gmx::TopologyInformation topInfo;
-     std::vector<RVec>        x, v;
-     matrix                   box = {{ 0 }};
+     /* solute configuration data */
+     gmx_mtop_t        top;
+     std::vector<RVec> x, v;
+     matrix            box  = {{ 0 }};
+     int               ePBC = -1;
+     t_atoms          *atoms;
+     snew(atoms, 1);
      if (bProt)
      {
          /* Generate a solute configuration */
          conf_prot = opt2fn("-cp", NFILE, fnm);
-         topInfo.fillFromInputFile(conf_prot);
-         x = copyOf(topInfo.x());
-         if (bReadV)
-         {
-             v = copyOf(topInfo.v());
-         }
-         topInfo.getBox(box);
          fprintf(stderr, "Reading solute configuration%s\n",
                  bReadV ? " and velocities" : "");
-         if (bReadV && v.empty())
+         bool  bTprFileWasRead;
+         rvec *temporaryX = nullptr, *temporaryV = nullptr;
+         readConfAndTopology(conf_prot, &bTprFileWasRead, &top,
+                             &ePBC, &temporaryX, bReadV ? &temporaryV : nullptr, box);
+         *atoms = gmx_mtop_global_atoms(&top);
+         x.assign(temporaryX, temporaryX + top.natoms);
+         sfree(temporaryX);
+         if (temporaryV)
+         {
+             v.assign(temporaryV, temporaryV + top.natoms);
+             sfree(temporaryV);
+         }
+         else if (bReadV)
          {
              fprintf(stderr, "Note: no velocities found\n");
          }
-         if (topInfo.mtop()->natoms == 0)
+         if (atoms->nr == 0)
          {
              fprintf(stderr, "Note: no atoms in %s\n", conf_prot);
              bProt = FALSE;
          }
          else
          {
-             firstSolventResidueIndex = topInfo.atoms()->nres;
+             firstSolventResidueIndex = atoms->nres;
          }
      }
-     auto atoms         = topInfo.copyAtoms();
-     int  ePBCForOutput = topInfo.ePBC();
+     int ePBCForOutput = ePBC;
      if (bBox)
      {
          ePBCForOutput = epbcXYZ;
                    "or give explicit -box command line option");
      }
  
-     add_solv(solventFileName, topInfo, atoms.get(), &x, &v, box,
+     add_solv(solventFileName, atoms, &top.symtab, &x, &v, ePBCForOutput, box,
 -             aps, defaultDistance, scaleFactor, r_shell, max_sol);
 +             &aps, defaultDistance, scaleFactor, r_shell, max_sol);
  
      /* write new configuration 1 to file confout */
      confout = ftp2fn(efSTO, NFILE, fnm);
      fprintf(stderr, "Writing generated configuration to %s\n", confout);
-     const char *outputTitle = (bProt ? *topInfo.mtop()->name : "Generated by gmx solvate");
-     write_sto_conf(confout, outputTitle, atoms.get(), as_rvec_array(x.data()),
+     const char *outputTitle = (bProt ? *top.name : "Generated by gmx solvate");
+     write_sto_conf(confout, outputTitle, atoms, as_rvec_array(x.data()),
                     !v.empty() ? as_rvec_array(v.data()) : nullptr, ePBCForOutput, box);
  
      /* print size of generated configuration */
      fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
              atoms->nr, atoms->nres);
-     update_top(atoms.get(), firstSolventResidueIndex, box, NFILE, fnm, &aps);
 -    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, aps);
++    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, &aps);
+     done_atom(atoms);
+     sfree(atoms);
 -    gmx_atomprop_destroy(aps);
      output_env_done(oenv);
  
      return 0;
index cf66b046a7a6d6955a043bb0c4fd74329c9e4618,48a26c647a60e4990678ab9dd84d2e34b0c7dcf1..e75f429e782970b325c02bd160075f5c89dcc2a7
@@@ -189,6 -189,23 +189,23 @@@ int Constraints::numFlexibleConstraints
      return impl_->nflexcon;
  }
  
+ bool Constraints::havePerturbedConstraints() const
+ {
+     const gmx_ffparams_t &ffparams = impl_->mtop.ffparams;
+     for (size_t i = 0; i < ffparams.functype.size(); i++)
+     {
+         if ((ffparams.functype[i] == F_CONSTR ||
+              ffparams.functype[i] == F_CONSTRNC) &&
+             ffparams.iparams[i].constr.dA != ffparams.iparams[i].constr.dB)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
  //! Clears constraint quantities for atoms in nonlocal region.
  static void clear_constraint_quantity_nonlocal(gmx_domdec_t *dd, rvec *q)
  {
@@@ -711,7 -728,7 +728,7 @@@ ArrayRef<real> Constraints::rmsdData() 
      }
      else
      {
 -        return EmptyArrayRef();
 +        return {};
      }
  }
  
@@@ -1122,22 -1139,6 +1139,22 @@@ Constraints::Impl::Impl(const gmx_mtop_
  
  Constraints::Impl::~Impl()
  {
 +    for (auto blocka : at2con_mt)
 +    {
 +        done_blocka(&blocka);
 +    }
 +    if (bSettleErrorHasOccurred != nullptr)
 +    {
 +        sfree(bSettleErrorHasOccurred);
 +    }
 +    if (vir_r_m_dr_th != nullptr)
 +    {
 +        sfree(vir_r_m_dr_th);
 +    }
 +    if (settled != nullptr)
 +    {
 +        settle_free(settled);
 +    }
      done_lincs(lincsd);
  }
  
index a07560bde73c022d597c8c6d66914d9c72642eda,bac22fd07a103f3a6e5a5b73694e1fef7c22b0cc..147ad445615f67e1a461b5adf9e3837c147b818b
@@@ -120,6 -120,9 +120,9 @@@ class Constraint
          /*! \brief Returns the total number of flexible constraints in the system. */
          int numFlexibleConstraints() const;
  
+         /*! \brief Returns whether the system contains perturbed constraints */
+         bool havePerturbedConstraints() const;
          /*! \brief Set up all the local constraints for the domain.
           *
           * \todo Make this a callback that is called automatically
@@@ -271,7 -274,7 +274,7 @@@ constr_iatomptr(gmx::ArrayRef<const int
                  gmx::ArrayRef<const int> iatom_constrnc,
                  int                      con)
  {
 -    if (con*3 < iatom_constr.size())
 +    if (con*3 < iatom_constr.ssize())
      {
          return iatom_constr.data() + con*3;
      }
index 0ed28de87e05c0c75cd7a2f78ca5fd8c301df6a2,8a7615365140ca43108e1a445b360f36696a93b3..1a7d648e801d74102a86dae217152757b2aa0335
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/ewald/ewald.h"
 -#include "gromacs/ewald/long-range-correction.h"
 +#include "gromacs/ewald/long_range_correction.h"
  #include "gromacs/ewald/pme.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
  #include "gromacs/gmxlib/nonbonded/nonbonded.h"
 -#include "gromacs/listed-forces/listed-forces.h"
 +#include "gromacs/listed_forces/listed_forces.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/math/vecdump.h"
  #include "gromacs/mdlib/force_flags.h"
 -#include "gromacs/mdlib/forcerec-threading.h"
 +#include "gromacs/mdlib/forcerec_threading.h"
  #include "gromacs/mdlib/mdrun.h"
  #include "gromacs/mdlib/ns.h"
  #include "gromacs/mdlib/qmmm.h"
@@@ -767,7 -767,7 +767,7 @@@ void sum_dhdl(gmx_enerdata_t *enerd, gm
  
          double &enerpart_lambda = enerd->enerpart_lambda[i + 1];
  
 -        for (gmx::index j = 0; j < lambda.size(); j++)
 +        for (gmx::index j = 0; j < lambda.ssize(); j++)
          {
              /* Note that this loop is over all dhdl components, not just the separated ones */
              const double dlam  = fepvals->all_lambda[j][i] - lambda[j];
                  enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR];
              }
  
-             if (j == efptMASS)
+             if (j == efptMASS && !fepvals->separate_dvdl[j])
              {
                  enerpart_lambda += dlam*enerd->term[F_DKDL];
              }
index 6dedbb795b4a78c9bf5664e3dc175846d0ef0b43,4b0992916ab6f12b19a59b5ead139840cf39b156..3d5578a664b07f0849877da60a1a25f0b68985c1
@@@ -49,7 -49,6 +49,7 @@@
  
  #include "gromacs/awh/awh.h"
  #include "gromacs/domdec/dlbtiming.h"
 +#include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
 -#include "gromacs/listed-forces/bonded.h"
 -#include "gromacs/listed-forces/disre.h"
 -#include "gromacs/listed-forces/gpubonded.h"
 -#include "gromacs/listed-forces/manage-threading.h"
 -#include "gromacs/listed-forces/orires.h"
 +#include "gromacs/listed_forces/bonded.h"
 +#include "gromacs/listed_forces/disre.h"
 +#include "gromacs/listed_forces/gpubonded.h"
 +#include "gromacs/listed_forces/manage_threading.h"
 +#include "gromacs/listed_forces/orires.h"
  #include "gromacs/math/arrayrefwithpadding.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/units.h"
  #include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/mdlib/mdrun.h"
 -#include "gromacs/mdlib/nb_verlet.h"
 -#include "gromacs/mdlib/nbnxn_atomdata.h"
 -#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 -#include "gromacs/mdlib/nbnxn_grid.h"
 -#include "gromacs/mdlib/nbnxn_search.h"
  #include "gromacs/mdlib/ppforceworkload.h"
  #include "gromacs/mdlib/qmmm.h"
  #include "gromacs/mdlib/update.h"
 -#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
  #include "gromacs/mdtypes/commrec.h"
 +#include "gromacs/mdtypes/enerdata.h"
  #include "gromacs/mdtypes/forceoutput.h"
  #include "gromacs/mdtypes/iforceprovider.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/mdtypes/state.h"
 +#include "gromacs/nbnxm/atomdata.h"
 +#include "gromacs/nbnxm/gpu_data_mgmt.h"
 +#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/logger.h"
 -#include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/strconvert.h"
  #include "gromacs/utility/sysinfo.h"
  
 -#include "nbnxn_gpu.h"
 -#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
 -#include "nbnxn_kernels/nbnxn_kernel_prune.h"
 -
  // TODO: this environment variable allows us to verify before release
  // that on less common architectures the total cost of polling is not larger than
  // a blocking wait (so polling does not introduce overhead when the static
@@@ -387,15 -393,14 +387,15 @@@ static void post_process_forces(const t
      }
  }
  
 -static void do_nb_verlet(const t_forcerec *fr,
 -                         const interaction_const_t *ic,
 -                         gmx_enerdata_t *enerd,
 -                         int flags, int ilocality,
 -                         int clearF,
 -                         int64_t step,
 -                         t_nrnb *nrnb,
 -                         gmx_wallcycle_t wcycle)
 +static void do_nb_verlet(t_forcerec                       *fr,
 +                         const interaction_const_t        *ic,
 +                         gmx_enerdata_t                   *enerd,
 +                         const int                         flags,
 +                         const Nbnxm::InteractionLocality  ilocality,
 +                         const int                         clearF,
 +                         const int64_t                     step,
 +                         t_nrnb                           *nrnb,
 +                         gmx_wallcycle_t                   wcycle)
  {
      if (!(flags & GMX_FORCE_NONBONDED))
      {
               * the current coordinates of the atoms.
               */
              wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
 -            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
 +            NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
              wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
          }
  
          wallcycle_sub_start(wcycle, ewcsNONBONDED);
      }
  
 -    switch (nbvg->kernel_type)
 -    {
 -        case nbnxnk4x4_PlainC:
 -        case nbnxnk4xN_SIMD_4xN:
 -        case nbnxnk4xN_SIMD_2xNN:
 -            nbnxn_kernel_cpu(nbvg,
 -                             nbv->nbat,
 -                             ic,
 -                             fr->shift_vec,
 -                             flags,
 -                             clearF,
 -                             fr->fshift[0],
 -                             enerd->grpp.ener[egCOULSR],
 -                             fr->bBHAM ?
 -                             enerd->grpp.ener[egBHAMSR] :
 -                             enerd->grpp.ener[egLJSR]);
 -            break;
 -
 -        case nbnxnk8x8x8_GPU:
 -            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
 -            break;
 +    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
  
 -        case nbnxnk8x8x8_PlainC:
 -            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
 -                                 nbv->nbat, ic,
 -                                 fr->shift_vec,
 -                                 flags,
 -                                 clearF,
 -                                 nbv->nbat->out[0].f,
 -                                 fr->fshift[0],
 -                                 enerd->grpp.ener[egCOULSR],
 -                                 fr->bBHAM ?
 -                                 enerd->grpp.ener[egBHAMSR] :
 -                                 enerd->grpp.ener[egLJSR]);
 -            break;
 -
 -        default:
 -            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 -
 -    }
      if (!bUsingGpuKernels)
      {
          wallcycle_sub_stop(wcycle, ewcsNONBONDED);
      }
 -
 -    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
 -    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 -    {
 -        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
 -    }
 -    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
 -             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
 -    {
 -        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
 -    }
 -    else
 -    {
 -        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
 -    }
 -    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
 -    if (flags & GMX_FORCE_ENERGY)
 -    {
 -        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
 -        enr_nbnxn_kernel_ljc += 1;
 -        enr_nbnxn_kernel_lj  += 1;
 -    }
 -
 -    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
 -             nbvg->nbl_lists.natpair_ljq);
 -    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
 -             nbvg->nbl_lists.natpair_lj);
 -    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
 -    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
 -             nbvg->nbl_lists.natpair_q);
 -
 -    if (ic->vdw_modifier == eintmodFORCESWITCH)
 -    {
 -        /* We add up the switch cost separately */
 -        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 -                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 -    }
 -    if (ic->vdw_modifier == eintmodPOTSWITCH)
 -    {
 -        /* We add up the switch cost separately */
 -        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 -                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 -    }
 -    if (ic->vdwtype == evdwPME)
 -    {
 -        /* We add up the LJ Ewald cost separately */
 -        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 -                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 -    }
  }
  
  static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
@@@ -860,12 -952,11 +860,12 @@@ static void alternatePmeNbGpuWaitReduce
          {
              GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
              wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 -            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
 -                                                    flags, eatLocal,
 -                                                    haveOtherWork,
 -                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 -                                                    fshift, completionType);
 +            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
 +                                                     flags,
 +                                                     Nbnxm::AtomLocality::Local,
 +                                                     haveOtherWork,
 +                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                                     fshift, completionType);
              wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
              // To get the call count right, when the task finished we
              // issue a start/stop.
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                  wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
 -                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
 +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                                 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
              }
          }
@@@ -907,15 -998,15 +907,15 @@@ static inline void launchGpuRollingPrun
       */
      int  numRollingParts     = nbv->listParams->numRollingParts;
      GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
 -    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
 +    int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
      bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
      if (stepWithCurrentList > 0 &&
          stepWithCurrentList < inputrec->nstlist - 1 &&
          (stepIsEven || DOMAINDECOMP(cr)))
      {
 -        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
 -                                          stepIsEven ? eintLocal : eintNonlocal,
 -                                          numRollingParts);
 +        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
 +                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
 +                                           numRollingParts);
      }
  }
  
@@@ -958,7 -1049,7 +958,7 @@@ static void do_force_cutsVERLET(FILE *f
      nonbonded_verlet_t *nbv = fr->nbv;
  
      bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
 -    bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
 +    bNS           = ((flags & GMX_FORCE_NS) != 0);
      bFillGrid     = (bNS && bStateChanged);
      bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
      bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
                                nullptr, 0, mdatoms->homenr, -1,
                                fr->cginfo, x.unpaddedArrayRef(),
                                0, nullptr,
 -                              nbv->grp[eintLocal].kernel_type,
 +                              nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                                nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
          }
              wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
              nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                         fr->cginfo, x.unpaddedArrayRef(),
 -                                       nbv->grp[eintNonlocal].kernel_type,
 +                                       nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                         nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
          }
  
          if (bNS)
          {
 -            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
 +            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
          }
  
 -        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
 +        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
  
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
              // higher-level object than the nb module.
              fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                    top->idef,
 -                                                                  nbnxn_gpu_get_xq(nbv->gpu_nbv),
 -                                                                  nbnxn_gpu_get_f(nbv->gpu_nbv),
 -                                                                  nbnxn_gpu_get_fshift(nbv->gpu_nbv));
 +                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
 +                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
 +                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
              ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
          }
  
      /* do local pair search */
      if (bNS)
      {
 +        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
 +
          wallcycle_start_nocount(wcycle, ewcNS);
          wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
          nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                              &top->excls,
                              nbv->listParams->rlistOuter,
                              nbv->min_ci_balanced,
 -                            &nbv->grp[eintLocal].nbl_lists,
 -                            eintLocal,
 -                            nbv->grp[eintLocal].kernel_type,
 +                            &pairlistSet,
 +                            Nbnxm::InteractionLocality::Local,
 +                            nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                              nrnb);
 -        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
 +        pairlistSet.outerListCreationStep = step;
          if (nbv->listParams->useDynamicPruning && !bUseGPU)
          {
 -            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
 +            nbnxnPrepareListForDynamicPruning(&pairlistSet);
          }
          wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
  
          if (bUseGPU)
          {
              /* initialize local pair-list on the GPU */
 -            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
 -                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
 -                                    eintLocal);
 +            Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
 +                                     pairlistSet.nblGpu[0],
 +                                     Nbnxm::InteractionLocality::Local);
          }
          wallcycle_stop(wcycle, ewcNS);
      }
      else
      {
 -        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
 +        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
 +                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                          nbv->nbat, wcycle);
      }
  
          wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
          wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -        nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
 +        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          // bonded work not split into separate local and non-local, so with DD
          }
  
          /* launch local nonbonded work on GPU */
-         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
 +        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                       step, nrnb, wcycle);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         do non-local pair search */
      if (DOMAINDECOMP(cr))
      {
 +        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
 +
          if (bNS)
          {
              wallcycle_start_nocount(wcycle, ewcNS);
                                  &top->excls,
                                  nbv->listParams->rlistOuter,
                                  nbv->min_ci_balanced,
 -                                &nbv->grp[eintNonlocal].nbl_lists,
 -                                eintNonlocal,
 -                                nbv->grp[eintNonlocal].kernel_type,
 +                                &pairlistSet,
 +                                Nbnxm::InteractionLocality::NonLocal,
 +                                nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                  nrnb);
 -            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
 +            pairlistSet.outerListCreationStep = step;
              if (nbv->listParams->useDynamicPruning && !bUseGPU)
              {
 -                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
 +                nbnxnPrepareListForDynamicPruning(&pairlistSet);
              }
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
  
 -            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
 +            if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
              {
                  /* initialize non-local pair-list on the GPU */
 -                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
 -                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
 -                                        eintNonlocal);
 +                Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
 +                                         pairlistSet.nblGpu[0],
 +                                         Nbnxm::InteractionLocality::NonLocal);
              }
              wallcycle_stop(wcycle, ewcNS);
          }
          {
              dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
  
 -            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
 +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
 +                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                              nbv->nbat, wcycle);
          }
  
              wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
              /* launch non-local nonbonded tasks on GPU */
-             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
 +            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
              if (ppForceWorkload->haveGpuBondedWork)
              }
  
              wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
 +            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          if (DOMAINDECOMP(cr))
          {
 -            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
 -                                     flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
 +            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
 +                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
          }
 -        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
 -                                 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
 +        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
 +                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
  
      if (!bUseOrEmulGPU)
      {
 -        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
 +        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                       step, nrnb, wcycle);
      }
  
          /* Calculate the local and non-local free energy interactions here.
           * Happens here on the CPU both with and without GPU.
           */
 -        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
 +        if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
          {
 -            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
 +            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
          }
  
          if (DOMAINDECOMP(cr) &&
 -            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
 +            fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
          {
 -            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
 +            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
  
      if (!bUseOrEmulGPU)
      {
 -        int aloc;
 -
          if (DOMAINDECOMP(cr))
          {
 -            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
 +            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
          }
  
 -        if (!bUseOrEmulGPU)
 -        {
 -            aloc = eintLocal;
 -        }
 -        else
 -        {
 -            aloc = eintNonlocal;
 -        }
 +        const Nbnxm::InteractionLocality iloc =
 +            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
  
          /* Add all the non-bonded force to the normal force array.
           * This can be split into a local and a non-local part when overlapping
           */
          wallcycle_stop(wcycle, ewcFORCE);
  
 -        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
 +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
  
          wallcycle_start_nocount(wcycle, ewcFORCE);
  
          /* if there are multiple fshift output buffers reduce them */
          if ((flags & GMX_FORCE_VIRIAL) &&
 -            nbv->grp[aloc].nbl_lists.nnbl > 1)
 +            nbv->grp[iloc].nbl_lists.nnbl > 1)
          {
              /* This is not in a subcounter because it takes a
                 negligible and constant-sized amount of time */
              if (bUseGPU)
              {
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
 -                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
 -                                           flags, eatNonlocal,
 -                                           ppForceWorkload->haveGpuBondedWork,
 -                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 -                                           fr->fshift);
 +                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
 +                                            flags, Nbnxm::AtomLocality::NonLocal,
 +                                            ppForceWorkload->haveGpuBondedWork,
 +                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                            fr->fshift);
                  cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
              }
              else
              {
                  wallcycle_start_nocount(wcycle, ewcFORCE);
 -                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
 +                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                               step, nrnb, wcycle);
                  wallcycle_stop(wcycle, ewcFORCE);
              }
  
              /* skip the reduction if there was no non-local work to do */
 -            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
 +            if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
              {
 -                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
 +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                                 nbv->nbat, f, wcycle);
              }
          }
          const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
  
          wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
 -        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
 -                                   flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
 -                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 -                                   fr->fshift);
 +        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
 +                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
 +                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                    fr->fshift);
          float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
          if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
          // NOTE: emulation kernel is not included in the balancing region,
          // but emulation mode does not target performance anyway
          wallcycle_start_nocount(wcycle, ewcFORCE);
 -        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
 +        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                       DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                       step, nrnb, wcycle);
          wallcycle_stop(wcycle, ewcFORCE);
          /* now clear the GPU outputs while we finish the step on the CPU */
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
 +        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
  
          /* Is dynamic pair-list pruning activated? */
          if (nbv->listParams->useDynamicPruning)
       * on the non-alternating path. */
      if (bUseOrEmulGPU && !alternateGpuWait)
      {
 -        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
 +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                         nbv->nbat, f, wcycle);
      }
      if (DOMAINDECOMP(cr))
@@@ -1743,7 -1836,7 +1743,7 @@@ static void do_force_cutsGROUP(FILE *fp
      }
  
      bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
 -    bNS            = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
 +    bNS            = ((flags & GMX_FORCE_NS) != 0);
      /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
      bFillGrid      = (bNS && bStateChanged);
      bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
@@@ -2744,7 -2837,7 +2744,7 @@@ void finish_run(FILE *fplog, const gmx:
  
      if (printReport)
      {
 -        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
 +        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
          gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
          if (pme_gpu_task_enabled(pme))
          {
      }
  }
  
 -extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
 +void initialize_lambdas(FILE               *fplog,
 +                        const t_inputrec   &ir,
 +                        bool                isMaster,
 +                        int                *fep_state,
 +                        gmx::ArrayRef<real> lambda,
 +                        double             *lam0)
  {
 -    /* this function works, but could probably use a logic rewrite to keep all the different
 -       types of efep straight. */
 +    /* TODO: Clean up initialization of fep_state and lambda in
 +       t_state.  This function works, but could probably use a logic
 +       rewrite to keep all the different types of efep straight. */
  
 -    if ((ir->efep == efepNO) && (!ir->bSimTemp))
 +    if ((ir.efep == efepNO) && (!ir.bSimTemp))
      {
          return;
      }
  
 -    t_lambda *fep = ir->fepvals;
 -    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
 -                                            if checkpoint is set -- a kludge is in for now
 -                                            to prevent this.*/
 +    const t_lambda *fep = ir.fepvals;
 +    if (isMaster)
 +    {
 +        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
 +                                             if checkpoint is set -- a kludge is in for now
 +                                             to prevent this.*/
 +    }
  
      for (int i = 0; i < efptNR; i++)
      {
 +        double thisLambda;
          /* overwrite lambda state with init_lambda for now for backwards compatibility */
 -        if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
 +        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
          {
 -            lambda[i] = fep->init_lambda;
 -            if (lam0)
 -            {
 -                lam0[i] = lambda[i];
 -            }
 +            thisLambda = fep->init_lambda;
          }
          else
          {
 -            lambda[i] = fep->all_lambda[i][*fep_state];
 -            if (lam0)
 -            {
 -                lam0[i] = lambda[i];
 -            }
 +            thisLambda = fep->all_lambda[i][fep->init_fep_state];
 +        }
 +        if (isMaster)
 +        {
 +            lambda[i] = thisLambda;
 +        }
 +        if (lam0 != nullptr)
 +        {
 +            lam0[i] = thisLambda;
          }
      }
 -    if (ir->bSimTemp)
 +    if (ir.bSimTemp)
      {
          /* need to rescale control temperatures to match current state */
 -        for (int i = 0; i < ir->opts.ngtc; i++)
 +        for (int i = 0; i < ir.opts.ngtc; i++)
          {
 -            if (ir->opts.ref_t[i] > 0)
 +            if (ir.opts.ref_t[i] > 0)
              {
 -                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
 +                ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
              }
          }
      }
          fprintf(fplog, "]\n");
      }
  }
 -
 -
 -void init_md(FILE *fplog,
 -             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
 -             t_inputrec *ir, const gmx_output_env_t *oenv,
 -             const MdrunOptions &mdrunOptions,
 -             double *t, double *t0,
 -             t_state *globalState, double *lam0,
 -             t_nrnb *nrnb, gmx_mtop_t *mtop,
 -             gmx_update_t **upd,
 -             gmx::BoxDeformation *deform,
 -             int nfile, const t_filenm fnm[],
 -             gmx_mdoutf_t *outf, t_mdebin **mdebin,
 -             tensor force_vir, tensor shake_vir,
 -             tensor total_vir, tensor pres, rvec mu_tot,
 -             gmx_bool *bSimAnn, t_vcm **vcm,
 -             gmx_wallcycle_t wcycle)
 -{
 -    int  i;
 -
 -    /* Initial values */
 -    *t = *t0       = ir->init_t;
 -
 -    *bSimAnn = FALSE;
 -    for (i = 0; i < ir->opts.ngtc; i++)
 -    {
 -        /* set bSimAnn if any group is being annealed */
 -        if (ir->opts.annealing[i] != eannNO)
 -        {
 -            *bSimAnn = TRUE;
 -        }
 -    }
 -
 -    /* Initialize lambda variables */
 -    /* TODO: Clean up initialization of fep_state and lambda in t_state.
 -     * We currently need to call initialize_lambdas on non-master ranks
 -     * to initialize lam0.
 -     */
 -    if (MASTER(cr))
 -    {
 -        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
 -    }
 -    else
 -    {
 -        int                      tmpFepState;
 -        std::array<real, efptNR> tmpLambda;
 -        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
 -    }
 -
 -    // TODO upd is never NULL in practice, but the analysers don't know that
 -    if (upd)
 -    {
 -        *upd = init_update(ir, deform);
 -    }
 -    if (*bSimAnn)
 -    {
 -        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
 -    }
 -
 -    if (vcm != nullptr)
 -    {
 -        *vcm = init_vcm(fplog, &mtop->groups, ir);
 -    }
 -
 -    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
 -    {
 -        if (ir->etc == etcBERENDSEN)
 -        {
 -            please_cite(fplog, "Berendsen84a");
 -        }
 -        if (ir->etc == etcVRESCALE)
 -        {
 -            please_cite(fplog, "Bussi2007a");
 -        }
 -        if (ir->eI == eiSD1)
 -        {
 -            please_cite(fplog, "Goga2012");
 -        }
 -    }
 -    init_nrnb(nrnb);
 -
 -    if (nfile != -1)
 -    {
 -        *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
 -
 -        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
 -                              mtop, ir, mdoutf_get_fp_dhdl(*outf));
 -    }
 -
 -    /* Initiate variables */
 -    clear_mat(force_vir);
 -    clear_mat(shake_vir);
 -    clear_rvec(mu_tot);
 -    clear_mat(total_vir);
 -    clear_mat(pres);
 -}
 -
 -void init_rerun(FILE *fplog,
 -                const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
 -                t_inputrec *ir, const gmx_output_env_t *oenv,
 -                const MdrunOptions &mdrunOptions,
 -                t_state *globalState, double *lam0,
 -                t_nrnb *nrnb, gmx_mtop_t *mtop,
 -                int nfile, const t_filenm fnm[],
 -                gmx_mdoutf_t *outf, t_mdebin **mdebin,
 -                gmx_wallcycle_t wcycle)
 -{
 -    /* Initialize lambda variables */
 -    /* TODO: Clean up initialization of fep_state and lambda in t_state.
 -     * We currently need to call initialize_lambdas on non-master ranks
 -     * to initialize lam0.
 -     */
 -    if (MASTER(cr))
 -    {
 -        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
 -    }
 -    else
 -    {
 -        int                      tmpFepState;
 -        std::array<real, efptNR> tmpLambda;
 -        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
 -    }
 -
 -    init_nrnb(nrnb);
 -
 -    if (nfile != -1)
 -    {
 -        *outf   = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
 -        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
 -                              mtop, ir, mdoutf_get_fp_dhdl(*outf), true);
 -    }
 -}
index a6f30f0e748b6701af35879af75577e6e0c7905f,d3be42c82a89bfe5684bbf914079f8060b534c81..fc6d53e60c2759d20ad3bc842a33aa4521d80e4d
@@@ -53,6 -53,7 +53,6 @@@
  
  #include "gromacs/awh/awh.h"
  #include "gromacs/commandline/filenm.h"
 -#include "gromacs/compat/make_unique.h"
  #include "gromacs/domdec/collect.h"
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_network.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
 -#include "gromacs/ewald/pme-load-balancing.h"
 +#include "gromacs/ewald/pme_load_balancing.h"
  #include "gromacs/fileio/trxio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
 -#include "gromacs/listed-forces/manage-threading.h"
 +#include "gromacs/listed_forces/manage_threading.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/utilities.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/mdlib/compute_io.h"
  #include "gromacs/mdlib/constr.h"
  #include "gromacs/mdlib/ebin.h"
 +#include "gromacs/mdlib/energyoutput.h"
  #include "gromacs/mdlib/expanded.h"
  #include "gromacs/mdlib/force.h"
  #include "gromacs/mdlib/force_flags.h"
  #include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/md_support.h"
  #include "gromacs/mdlib/mdatoms.h"
 -#include "gromacs/mdlib/mdebin.h"
  #include "gromacs/mdlib/mdoutf.h"
  #include "gromacs/mdlib/mdrun.h"
  #include "gromacs/mdlib/mdsetup.h"
  #include "gromacs/mdlib/membed.h"
 -#include "gromacs/mdlib/nb_verlet.h"
 -#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/ns.h"
  #include "gromacs/mdlib/resethandler.h"
  #include "gromacs/mdlib/shellfc.h"
  #include "gromacs/mdlib/update.h"
  #include "gromacs/mdlib/vcm.h"
  #include "gromacs/mdlib/vsite.h"
 -#include "gromacs/mdtypes/awh-history.h"
 -#include "gromacs/mdtypes/awh-params.h"
 +#include "gromacs/mdtypes/awh_history.h"
 +#include "gromacs/mdtypes/awh_params.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/df_history.h"
  #include "gromacs/mdtypes/energyhistory.h"
@@@ -192,6 -195,7 +192,6 @@@ void gmx::Integrator::do_rerun(
      // t_inputrec is being replaced by IMdpOptionsProvider, so this
      // will go away eventually.
      t_inputrec             *ir   = inputrec;
 -    gmx_mdoutf             *outf = nullptr;
      int64_t                 step, step_rel;
      double                  t, lam0[efptNR];
      bool                    isLastStep               = false;
      t_trxstatus            *status;
      rvec                    mu_tot;
      t_trxframe              rerun_fr;
 -    gmx_localtop_t         *top;
 -    t_mdebin               *mdebin   = nullptr;
 +    gmx_localtop_t          top;
      gmx_enerdata_t         *enerd;
      PaddedVector<gmx::RVec> f {};
      gmx_global_stat_t       gstat;
      t_graph                *graph = nullptr;
      gmx_groups_t           *groups;
 -    gmx_ekindata_t         *ekind;
      gmx_shellfc_t          *shellfc;
  
      double                  cycles;
                     "be available in a different form in a future version of GROMACS, "
                     "e.g. gmx rerun -f.");
  
+     if (ir->efep != efepNO && (mdAtoms->mdatoms()->nMassPerturbed > 0 ||
+                                (constr && constr->havePerturbedConstraints())))
+     {
+         gmx_fatal(FARGS, "Perturbed masses or constraints are not supported by rerun. "
+                   "Either make a .tpr without mass and constraint perturbation, "
+                   "or use GROMACS 2018.4, 2018.5 or later 2018 version.");
+     }
      if (ir->bExpanded)
      {
          gmx_fatal(FARGS, "Expanded ensemble not supported by rerun.");
          top_global->intermolecularExclusionGroup = genQmmmIndices(*top_global);
      }
  
 -    /* Initial values */
 -    init_rerun(fplog, cr, outputProvider, ir, oenv, mdrunOptions,
 -               state_global, lam0, nrnb, top_global,
 -               nfile, fnm, &outf, &mdebin, wcycle);
 +    initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
 +    init_nrnb(nrnb);
 +    gmx_mdoutf       *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, oenv, wcycle);
 +    gmx::EnergyOutput energyOutput;
 +    energyOutput.prepare(mdoutf_get_fp_ene(outf), top_global, ir, mdoutf_get_fp_dhdl(outf), true);
  
      /* Energy terms and groups */
      snew(enerd, 1);
                    enerd);
  
      /* Kinetic energy data */
 -    snew(ekind, 1);
 +    std::unique_ptr<gmx_ekindata_t> eKinData = std::make_unique<gmx_ekindata_t>();
 +    gmx_ekindata_t                 *ekind    = eKinData.get();
      init_ekindata(fplog, top_global, &(ir->opts), ekind);
      /* Copy the cos acceleration to the groups struct */
      ekind->cosacc.cos_accel = ir->cos_accel;
                                   ir->nstcalcenergy, DOMAINDECOMP(cr));
  
      {
 -        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
 +        double io = compute_io(ir, top_global->natoms, groups, energyOutput.numEnergyTerms(), 1);
          if ((io > 2000) && MASTER(cr))
          {
              fprintf(stderr,
  
      if (DOMAINDECOMP(cr))
      {
 -        top = dd_init_local_top(top_global);
 +        dd_init_local_top(*top_global, &top);
  
 -        stateInstance = compat::make_unique<t_state>();
 +        stateInstance = std::make_unique<t_state>();
          state         = stateInstance.get();
          dd_init_local_state(cr->dd, state_global, state);
  
          /* Distribute the charge groups over the nodes from the master node */
          dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
 -                            state_global, top_global, ir,
 -                            state, &f, mdAtoms, top, fr,
 +                            state_global, *top_global, ir,
 +                            state, &f, mdAtoms, &top, fr,
                              vsite, constr,
                              nrnb, nullptr, FALSE);
          shouldCheckNumberOfBondedInteractions = true;
          /* Copy the pointer to the global state */
          state = state_global;
  
 -        snew(top, 1);
 -        mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr,
 +        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr,
                                    &graph, mdAtoms, constr, vsite, shellfc);
      }
  
                          &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags);
      }
      checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
 -                                    top_global, top, state,
 +                                    top_global, &top, state,
                                      &shouldCheckNumberOfBondedInteractions);
  
      if (MASTER(cr))
                  gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, "
                            "use a single rank");
              }
 -            prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top->idef, ir->delta_t, *fr, graph);
 +            prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top.idef, ir->delta_t, *fr, graph);
          }
  
          isLastStep = isLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
              const bool bMasterState = true;
              dd_partition_system(fplog, mdlog, step, cr,
                                  bMasterState, nstglobalcomm,
 -                                state_global, top_global, ir,
 -                                state, &f, mdAtoms, top, fr,
 +                                state_global, *top_global, ir,
 +                                state, &f, mdAtoms, &top, fr,
                                  vsite, constr,
                                  nrnb, wcycle,
                                  mdrunOptions.verbose);
              /* Now is the time to relax the shells */
              relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
                                  enforcedRotation, step,
 -                                ir, bNS, force_flags, top,
 +                                ir, bNS, force_flags, &top,
                                  constr, enerd, fcd,
                                  state, f.arrayRefWithPadding(), force_vir, mdatoms,
                                  nrnb, wcycle, graph, groups,
              Awh       *awh = nullptr;
              gmx_edsam *ed  = nullptr;
              do_force(fplog, cr, ms, ir, awh, enforcedRotation,
 -                     step, nrnb, wcycle, top, groups,
 +                     step, nrnb, wcycle, &top, groups,
                       state->box, state->x.arrayRefWithPadding(), &state->hist,
                       f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
                       state->lambda, graph,
              do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
                                       ir, state, state_global, observablesHistory,
                                       top_global, fr,
 -                                     outf, mdebin, ekind, f,
 +                                     outf, energyOutput, ekind, f,
                                       isCheckpointingStep, doRerun, isLastStep,
                                       mdrunOptions.writeConfout,
                                       bSumEkinhOld);
                  shift_self(graph, state->box, as_rvec_array(state->x.data()));
              }
              construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, as_rvec_array(state->v.data()),
 -                             top->idef.iparams, top->idef.il,
 +                             top.idef.iparams, top.idef.il,
                               fr->ePBC, fr->bMolPBC, cr, state->box);
  
              if (graph != nullptr)
                              | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
                              );
              checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
 -                                            top_global, top, state,
 +                                            top_global, &top, state,
                                              &shouldCheckNumberOfBondedInteractions);
          }
  
          if (MASTER(cr))
          {
              const bool bCalcEnerStep = true;
 -            upd_mdebin(mdebin, doFreeEnergyPerturbation, bCalcEnerStep,
 -                       t, mdatoms->tmass, enerd, state,
 -                       ir->fepvals, ir->expandedvals, rerun_fr.box,
 -                       shake_vir, force_vir, total_vir, pres,
 -                       ekind, mu_tot, constr);
 +            energyOutput.addDataAtEnergyStep(doFreeEnergyPerturbation, bCalcEnerStep,
 +                                             t, mdatoms->tmass, enerd, state,
 +                                             ir->fepvals, ir->expandedvals, state->box,
 +                                             shake_vir, force_vir, total_vir, pres,
 +                                             ekind, mu_tot, constr);
  
              const bool do_ene = true;
              const bool do_log = true;
              const bool do_dr  = ir->nstdisreout != 0;
              const bool do_or  = ir->nstorireout != 0;
  
 -            print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr,
 -                       step, t,
 -                       eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh);
 +            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
 +                                               do_log ? fplog : nullptr,
 +                                               step, t,
 +                                               eprNORMAL, fcd, groups, &(ir->opts), awh);
  
              if (do_per_step(step, ir->nstlog))
              {
          gmx_pme_send_finish(cr);
      }
  
 -    done_mdebin(mdebin);
      done_mdoutf(outf);
  
      done_shellfc(fplog, shellfc, step_rel);
  
      destroy_enerdata(enerd);
      sfree(enerd);
 -    sfree(top);
  }
index 3ad53c99eb748531538f3fdc9a4a683e8d99136e,ae158a4984deefd1d61ca9e2a912143db1c75e05..1ae70af026460cad75cb4bb39136d2f23013456b
  #include <cstring>
  
  #include <algorithm>
 +#include <memory>
  
  #include "gromacs/commandline/filenm.h"
 -#include "gromacs/compat/make_unique.h"
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/domdec/localatomsetmanager.h"
 -#include "gromacs/ewald/ewald-utils.h"
 +#include "gromacs/ewald/ewald_utils.h"
  #include "gromacs/ewald/pme.h"
 -#include "gromacs/ewald/pme-gpu-program.h"
 +#include "gromacs/ewald/pme_gpu_program.h"
  #include "gromacs/fileio/checkpoint.h"
  #include "gromacs/fileio/gmxfio.h"
  #include "gromacs/fileio/oenv.h"
@@@ -74,9 -74,9 +74,9 @@@
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/detecthardware.h"
  #include "gromacs/hardware/printhardware.h"
 -#include "gromacs/listed-forces/disre.h"
 -#include "gromacs/listed-forces/gpubonded.h"
 -#include "gromacs/listed-forces/orires.h"
 +#include "gromacs/listed_forces/disre.h"
 +#include "gromacs/listed_forces/gpubonded.h"
 +#include "gromacs/listed_forces/orires.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/utilities.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/mdlib/mdatoms.h"
  #include "gromacs/mdlib/mdrun.h"
  #include "gromacs/mdlib/membed.h"
 -#include "gromacs/mdlib/nb_verlet.h"
 -#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 -#include "gromacs/mdlib/nbnxn_search.h"
 -#include "gromacs/mdlib/nbnxn_tuning.h"
  #include "gromacs/mdlib/ppforceworkload.h"
  #include "gromacs/mdlib/qmmm.h"
  #include "gromacs/mdlib/sighandler.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/mdtypes/observableshistory.h"
  #include "gromacs/mdtypes/state.h"
 +#include "gromacs/nbnxm/nbnxm.h"
 +#include "gromacs/nbnxm/pairlist_tuning.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/output.h"
  #include "gromacs/pulling/pull.h"
@@@ -167,7 -169,7 +167,7 @@@ Mdrunner Mdrunner::cloneOnSpawnedThread
      // part of the interface to the client code, which is associated only with the
      // original thread. Handles to the same resources can be obtained by copy.
      {
 -        newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_);
 +        newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
      }
  
      // Copy original cr pointer before master thread can pass the thread barrier
      newRunner.replExParams        = replExParams;
      newRunner.pforce              = pforce;
      newRunner.ms                  = ms;
 -    newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 +    newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
  
      threadMpiMdrunnerAccessBarrier();
  
@@@ -426,8 -428,11 +426,8 @@@ int Mdrunner::mdrunner(
      int                       nChargePerturbed = -1, nTypePerturbed = 0;
      gmx_wallcycle_t           wcycle;
      gmx_walltime_accounting_t walltime_accounting = nullptr;
 -    int                       rc;
 -    int64_t                   reset_counters;
 -    int                       nthreads_pme = 1;
 -    gmx_membed_t *            membed       = nullptr;
 -    gmx_hw_info_t            *hwinfo       = nullptr;
 +    gmx_membed_t *            membed              = nullptr;
 +    gmx_hw_info_t            *hwinfo              = nullptr;
  
      /* CAUTION: threads may be started later on in this function, so
         cr doesn't reflect the final parallel state right now */
      std::vector<int>    gpuIdsAvailable;
      try
      {
 -        gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable);
 -        // TODO We could put the GPU IDs into a std::map to find
 -        // duplicates, but for the small numbers of IDs involved, this
 -        // code is simple and fast.
 -        for (size_t i = 0; i != gpuIdsAvailable.size(); ++i)
 -        {
 -            for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j)
 -            {
 -                if (gpuIdsAvailable[i] == gpuIdsAvailable[j])
 -                {
 -                    GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str())));
 -                }
 -            }
 -        }
 +        gpuIdsAvailable = parseUserGpuIdString(hw_opt.gpuIdsAvailable);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
  
      std::vector<int> userGpuTaskAssignment;
      try
      {
 -        userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment);
 +        userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
      auto       nonbondedTarget = findTaskTarget(nbpu_opt);
      if (SIMMASTER(cr))
      {
          /* Only the master rank has the global state */
 -        globalState = compat::make_unique<t_state>();
 +        globalState = std::make_unique<t_state>();
  
          /* Read (nearly) all data required for the simulation */
          read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop);
                      inputrec->cutoff_scheme == ecutsVERLET,
                      gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI),
                      hw_opt.nthreads_tmpi);
-             auto canUseGpuForPme   = pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, mtop, nullptr);
              useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
                      (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
-                     canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+                     *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
  
          }
          GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                                                                  usingVerletScheme,
                                                                  gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI),
                                                                  gpusWereDetected);
-         auto canUseGpuForPme   = pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, mtop, nullptr);
          useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
-                                                     canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks,
+                                                     *hwinfo, *inputrec, mtop,
+                                                     cr->nnodes, domdecOptions.numPmeRanks,
                                                      gpusWereDetected);
          auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
          useGpuForBonded =
      {
          if (!MASTER(cr))
          {
 -            globalState = compat::make_unique<t_state>();
 +            globalState = std::make_unique<t_state>();
          }
          broadcastStateWithoutDynamics(cr, globalState.get());
      }
          gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
      }
  
 -    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
 -    {
 -        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
 -    }
 -
      if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
      {
          if (domdecOptions.numPmeRanks > 0)
          }
      }
  
 -    /* getting number of PP/PME threads
 +    /* getting number of PP/PME threads on this MPI / tMPI rank.
         PME: env variable should be read only on one node to make sure it is
         identical everywhere;
       */
 -    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 -
 -    int numThreadsOnThisRank;
 -    /* threads on this MPI process or TMPI thread */
 -    if (thisRankHasDuty(cr, DUTY_PP))
 -    {
 -        numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
 -    }
 -    else
 -    {
 -        numThreadsOnThisRank = nthreads_pme;
 -    }
 -
 +    const int numThreadsOnThisRank =
 +        thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
      checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
                                    *hwinfo->hardwareTopology,
                                    physicalNodeComm, mdlog);
      {
          /* Master synchronizes its value of reset_counters with all nodes
           * including PME only nodes */
 -        reset_counters = wcycle_get_reset_counters(wcycle);
 +        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
          gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
          wcycle_set_reset_counters(wcycle, reset_counters);
      }
                                         mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0,
                                         mdrunOptions.reproducible,
                                         ewaldcoeff_q, ewaldcoeff_lj,
 -                                       nthreads_pme,
 +                                       gmx_omp_nthreads_get(emntPME),
                                         pmeRunMode, nullptr,
                                         pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
              }
                 pmedata,
                 EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
  
 +    // clean up cycle counter
 +    wallcycle_destroy(wcycle);
 +
      // Free PME data
      if (pmedata)
      {
          gmx_fedisableexcept();
      }
  
 -    rc = static_cast<int>(gmx_get_stop_condition());
 +    auto rc = static_cast<int>(gmx_get_stop_condition());
  
  #if GMX_THREAD_MPI
      /* we need to join all threads. The sub-threads join when they
         wait for that. */
      if (PAR(cr) && MASTER(cr))
      {
 -        done_commrec(cr);
          tMPI_Finalize();
      }
 +    //TODO free commrec in MPI simulations
 +    done_commrec(cr);
  #endif
 -
      return rc;
  }
  
@@@ -1660,7 -1690,7 +1659,7 @@@ void Mdrunner::BuilderImplementation::a
  
  void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
  {
 -    multisim_ = compat::make_unique<gmx_multisim_t*>(multisim);
 +    multisim_ = std::make_unique<gmx_multisim_t*>(multisim);
  }
  
  Mdrunner Mdrunner::BuilderImplementation::build()
          GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
      }
  
 -    newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>();
 +    newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
  
      if (stopHandlerBuilder_)
      {
      }
      else
      {
 -        newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>();
 +        newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
      }
  
      return newRunner;
@@@ -1794,7 -1824,7 +1793,7 @@@ void Mdrunner::BuilderImplementation::a
  }
  
  MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) :
 -    impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)}
 +    impl_ {std::make_unique<Mdrunner::BuilderImplementation>(context)}
  {
  }
  
index 34b932394b97f81c62fa1710c6529d8729c7902a,0000000000000000000000000000000000000000..758e5ec55d16c6ea3799dda3c8844016c03e6e5f
mode 100644,000000..100644
--- /dev/null
@@@ -1,737 -1,0 +1,737 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/*! \internal \file
 + *  \brief
 + *  Utility constant and function declaration for the CUDA non-bonded kernels.
 + *  This header should be included once at the top level, just before the
 + *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
 + *
 + *  \author Szilárd Páll <pall.szilard@gmail.com>
 + *  \ingroup module_nbnxm
 + */
 +#include <assert.h>
 +
 +/* Note that floating-point constants in CUDA code should be suffixed
 + * with f (e.g. 0.5f), to stop the compiler producing intermediate
 + * code that is in double precision.
 + */
 +
 +#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 +#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
 +#include "gromacs/gpu_utils/vectype_ops.cuh"
 +
 +#include "nbnxm_cuda_types.h"
 +
 +#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
 +#define NBNXM_CUDA_KERNEL_UTILS_CUH
 +
 +/*! \brief Log of the i and j cluster size.
 + *  change this together with c_clSize !*/
 +static const int __device__          c_clSizeLog2  = 3;
 +/*! \brief Square of cluster size. */
 +static const int __device__          c_clSizeSq    = c_clSize*c_clSize;
 +/*! \brief j-cluster size after split (4 in the current implementation). */
 +static const int __device__          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
 +/*! \brief Stride in the force accumualation buffer */
 +static const int __device__          c_fbufStride  = c_clSizeSq;
 +/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
 +static const unsigned __device__     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
 +
 +static const float __device__        c_oneSixth    = 0.16666667f;
 +static const float __device__        c_oneTwelveth = 0.08333333f;
 +
 +
 +/*! Convert LJ sigma,epsilon parameters to C6,C12. */
 +static __forceinline__ __device__
 +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 +                                     const float  epsilon,
 +                                     float       *c6,
 +                                     float       *c12)
 +{
 +    float sigma2, sigma6;
 +
 +    sigma2 = sigma * sigma;
 +    sigma6 = sigma2 *sigma2 * sigma2;
 +    *c6    = epsilon * sigma6;
 +    *c12   = *c6 * sigma6;
 +}
 +
 +/*! Apply force switch,  force + energy version. */
 +static __forceinline__ __device__
 +void calculate_force_switch_F(const  cu_nbparam_t nbparam,
 +                              float               c6,
 +                              float               c12,
 +                              float               inv_r,
 +                              float               r2,
 +                              float              *F_invr)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam.rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +}
 +
 +/*! Apply force switch, force-only version. */
 +static __forceinline__ __device__
 +void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 +                                float               c6,
 +                                float               c12,
 +                                float               inv_r,
 +                                float               r2,
 +                                float              *F_invr,
 +                                float              *E_lj)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
 +
 +    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 +    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 +    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 +    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam.rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +    *E_lj    +=
 +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 +}
 +
 +/*! Apply potential switch, force-only version. */
 +static __forceinline__ __device__
 +void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 +                                  float               inv_r,
 +                                  float               r2,
 +                                  float              *F_invr,
 +                                  float              *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam.vdw_switch.c3;
 +    float switch_V4 = nbparam.vdw_switch.c4;
 +    float switch_V5 = nbparam.vdw_switch.c5;
 +    float switch_F2 = 3*nbparam.vdw_switch.c3;
 +    float switch_F3 = 4*nbparam.vdw_switch.c4;
 +    float switch_F4 = 5*nbparam.vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam.rvdw_switch;
 +
 +    /* Unlike in the F+E kernel, conditional is faster here */
 +    if (r_switch > 0.0f)
 +    {
 +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    }
 +}
 +
 +/*! Apply potential switch, force + energy version. */
 +static __forceinline__ __device__
 +void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 +                                    float               inv_r,
 +                                    float               r2,
 +                                    float              *F_invr,
 +                                    float              *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam.vdw_switch.c3;
 +    float switch_V4 = nbparam.vdw_switch.c4;
 +    float switch_V5 = nbparam.vdw_switch.c5;
 +    float switch_F2 = 3*nbparam.vdw_switch.c3;
 +    float switch_F3 = 4*nbparam.vdw_switch.c4;
 +    float switch_F4 = 5*nbparam.vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam.rvdw_switch;
 +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    /* Unlike in the F-only kernel, masking is faster here */
 +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    *E_lj   *= sw;
 +}
 +
 +
 +/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
 +                                int                typei,
 +                                int                typej)
 +{
 +#if DISABLE_CUDA_TEXTURES
 +    return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
 +#else
 +    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 +#endif /* DISABLE_CUDA_TEXTURES */
 +}
 +
 +
 +/*! Calculate LJ-PME grid force contribution with
 + *  geometric combination rule.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 +                                    int                typei,
 +                                    int                typej,
 +                                    float              r2,
 +                                    float              inv_r2,
 +                                    float              lje_coeff2,
 +                                    float              lje_coeff6_6,
 +                                    float             *F_invr)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +
 +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +}
 +
 +
 +/*! Calculate LJ-PME grid force + energy contribution with
 + *  geometric combination rule.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 +                                      int                typei,
 +                                      int                typej,
 +                                      float              r2,
 +                                      float              inv_r2,
 +                                      float              lje_coeff2,
 +                                      float              lje_coeff6_6,
 +                                      float              int_bit,
 +                                      float             *F_invr,
 +                                      float             *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 +
 +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    /* Shift should be applied only to real LJ pairs */
 +    sh_mask   = nbparam.sh_lj_ewald*int_bit;
 +    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +}
 +
 +/*! Fetch per-type LJ parameters.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
 +                              int                type)
 +{
 +    float2 c6c12;
 +#if DISABLE_CUDA_TEXTURES
 +    /* Force an 8-byte fetch to save a memory instruction. */
 +    float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
 +    c6c12 = LDG(&nbfp_comb[type]);
 +#else
 +    /* NOTE: as we always do 8-byte aligned loads, we could
 +       fetch float2 here too just as above. */
 +    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
 +    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
 +#endif /* DISABLE_CUDA_TEXTURES */
 +
 +    return c6c12;
 +}
 +
 +
 +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
 + *  Lorentz-Berthelot combination rule.
 + *  We use a single F+E kernel with conditional because the performance impact
 + *  of this is pretty small and LB on the CPU is anyway very slow.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 +                                    int                typei,
 +                                    int                typej,
 +                                    float              r2,
 +                                    float              inv_r2,
 +                                    float              lje_coeff2,
 +                                    float              lje_coeff6_6,
 +                                    float              int_bit,
 +                                    float             *F_invr,
 +                                    float             *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +    float sigma, sigma2, epsilon;
 +
 +    /* sigma and epsilon are scaled to give 6*C6 */
 +    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
 +    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
 +
 +    sigma   = c6c12_i.x + c6c12_j.x;
 +    epsilon = c6c12_i.y * c6c12_j.y;
 +
 +    sigma2  = sigma*sigma;
 +    c6grid  = epsilon*sigma2*sigma2*sigma2;
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    if (E_lj != nullptr)
 +    {
 +        float sh_mask;
 +
 +        /* Shift should be applied only to real LJ pairs */
 +        sh_mask   = nbparam.sh_lj_ewald*int_bit;
 +        *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +    }
 +}
 +
 +
 +/*! Fetch two consecutive values from the Ewald correction F*r table.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 +                             int                index)
 +{
 +    float2 d;
 +
 +#if DISABLE_CUDA_TEXTURES
 +    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
 +    d.x = LDG(&nbparam.coulomb_tab[index]);
 +    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 +#else
 +    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 +    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 +#endif // DISABLE_CUDA_TEXTURES
 +
 +    return d;
 +}
 +
 +/*! Linear interpolation using exactly two FMA operations.
 + *
 + *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 + *  Note that CUDA does not have fnms, otherwise we'd use
 + *  fma(t, d1, fnms(t, d0, d0)
 + *  but input modifiers are designed for this and are fast.
 + */
 +template <typename T>
 +__forceinline__ __host__ __device__
 +T lerp(T d0, T d1, T t)
 +{
 +    return fma(t, d1, fma(-t, d0, d0));
 +}
 +
 +/*! Interpolate Ewald coulomb force correction using the F*r table.
 + */
 +static __forceinline__ __device__
 +float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 +                                  float              r)
 +{
 +    float  normalized = nbparam.coulomb_tab_scale * r;
 +    int    index      = (int) normalized;
 +    float  fraction   = normalized - index;
 +
 +    float2 d01 = fetch_coulomb_force_r(nbparam, index);
 +
 +    return lerp(d01.x, d01.y, fraction);
 +}
 +
 +/*! Fetch C6 and C12 from the parameter table.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +void fetch_nbfp_c6_c12(float               &c6,
 +                       float               &c12,
 +                       const cu_nbparam_t   nbparam,
 +                       int                  baseIndex)
 +{
 +#if DISABLE_CUDA_TEXTURES
 +    /* Force an 8-byte fetch to save a memory instruction. */
 +    float2 *nbfp = (float2 *)nbparam.nbfp;
 +    float2  c6c12;
 +    c6c12 = LDG(&nbfp[baseIndex]);
 +    c6    = c6c12.x;
 +    c12   = c6c12.y;
 +#else
 +    /* NOTE: as we always do 8-byte aligned loads, we could
 +       fetch float2 here too just as above. */
 +    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
 +    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
 +#endif // DISABLE_CUDA_TEXTURES
 +}
 +
 +
 +/*! Calculate analytical Ewald correction term. */
 +static __forceinline__ __device__
 +float pmecorrF(float z2)
 +{
 +    const float FN6 = -1.7357322914161492954e-8f;
 +    const float FN5 = 1.4703624142580877519e-6f;
 +    const float FN4 = -0.000053401640219807709149f;
 +    const float FN3 = 0.0010054721316683106153f;
 +    const float FN2 = -0.019278317264888380590f;
 +    const float FN1 = 0.069670166153766424023f;
 +    const float FN0 = -0.75225204789749321333f;
 +
 +    const float FD4 = 0.0011193462567257629232f;
 +    const float FD3 = 0.014866955030185295499f;
 +    const float FD2 = 0.11583842382862377919f;
 +    const float FD1 = 0.50736591960530292870f;
 +    const float FD0 = 1.0f;
 +
 +    float       z4;
 +    float       polyFN0, polyFN1, polyFD0, polyFD1;
 +
 +    z4          = z2*z2;
 +
 +    polyFD0     = FD4*z4 + FD2;
 +    polyFD1     = FD3*z4 + FD1;
 +    polyFD0     = polyFD0*z4 + FD0;
 +    polyFD0     = polyFD1*z2 + polyFD0;
 +
 +    polyFD0     = 1.0f/polyFD0;
 +
 +    polyFN0     = FN6*z4 + FN4;
 +    polyFN1     = FN5*z4 + FN3;
 +    polyFN0     = polyFN0*z4 + FN2;
 +    polyFN1     = polyFN1*z4 + FN1;
 +    polyFN0     = polyFN0*z4 + FN0;
 +    polyFN0     = polyFN1*z2 + polyFN0;
 +
 +    return polyFN0*polyFD0;
 +}
 +
 +/*! Final j-force reduction; this generic implementation works with
 + *  arbitrary array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_j_generic(float *f_buf, float3 *fout,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    if (tidxi < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 +        {
 +            f += f_buf[c_fbufStride * tidxi + j];
 +        }
 +
 +        atomicAdd((&fout[aidx].x)+tidxi, f);
 +    }
 +}
 +
 +/*! Final j-force reduction; this implementation only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 +                              int tidxi, int aidx,
 +                              const unsigned int activemask)
 +{
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
 +    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
 +    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
 +
 +    if (tidxi & 1)
 +    {
 +        f.x = f.y;
 +    }
 +
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
 +    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
 +
 +    if (tidxi & 2)
 +    {
 +        f.x = f.z;
 +    }
 +
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
 +
 +    if (tidxi < 3)
 +    {
 +        atomicAdd((&fout[aidx].x) + tidxi, f.x);
 +    }
 +}
 +
 +/*! Final i-force reduction; this generic implementation works with
 + *  arbitrary array sizes.
 + * TODO: add the tidxi < 3 trick
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_generic(float *f_buf, float3 *fout,
 +                            float *fshift_buf, bool bCalcFshift,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    if (tidxj < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 +        {
 +            f += f_buf[tidxj * c_fbufStride + j];
 +        }
 +
 +        atomicAdd(&fout[aidx].x + tidxj, f);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += f;
 +        }
 +    }
 +}
 +
 +/*! Final i-force reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 +                         float *fshift_buf, bool bCalcFshift,
 +                         int tidxi, int tidxj, int aidx)
 +{
 +    int     i, j;
 +    float   f;
 +
 +    assert(c_clSize == 1 << c_clSizeLog2);
 +
 +    /* Reduce the initial c_clSize values for each i atom to half
 +     * every step by using c_clSize * i threads.
 +     * Can't just use i as loop variable because then nvcc refuses to unroll.
 +     */
 +    i = c_clSize/2;
 +#pragma unroll 5
 +    for (j = c_clSizeLog2 - 1; j > 0; j--)
 +    {
 +        if (tidxj < i)
 +        {
 +
 +            f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 +            f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 +            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 +        }
 +        i >>= 1;
 +    }
 +
 +    /* i == 1, last reduction step, writing to global mem */
 +    if (tidxj < 3)
 +    {
 +        /* tidxj*c_fbufStride selects x, y or z */
 +        f = f_buf[tidxj * c_fbufStride               + tidxi] +
 +            f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 +
 +        atomicAdd(&(fout[aidx].x) + tidxj, f);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += f;
 +        }
 +    }
 +
 +}
 +
 +/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 + *  on whether the size of the array to be reduced is power of two or not.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i(float *f_buf, float3 *f,
 +                    float *fshift_buf, bool bCalcFshift,
 +                    int tidxi, int tidxj, int ai)
 +{
 +    /* Non-zero iff c_clSize is not a power of two; c_clSize is a
 +     * compile-time constant, so this branch is resolved at compile time. */
 +    if ((c_clSize & (c_clSize - 1)))
 +    {
 +        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 +    }
 +    else
 +    {
 +        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 +    }
 +}
 +
 +/*! Final i-force reduction using warp shuffles; this implementation only
 + *  works with power of two array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 +                              float *fshift_buf, bool bCalcFshift,
 +                              int tidxj, int aidx,
 +                              const unsigned int activemask)
 +{
 +    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
 +    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
 +    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
 +
 +    /* After the first shuffle round, odd rows carry the reduced y in fin.x */
 +    if (tidxj & 1)
 +    {
 +        fin.x = fin.y;
 +    }
 +
 +    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
 +    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 +
 +    if (tidxj & 2)
 +    {
 +        fin.x = fin.z;
 +    }
 +
 +    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 +    if ((tidxj & 3) < 3)
 +    {
 +        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += fin.x;
 +        }
 +    }
 +}
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_energy_pow2(volatile float *buf,
 +                        float *e_lj, float *e_el,
 +                        unsigned int tidx)
 +{
 +    float        e1, e2;
 +
 +    unsigned int i = warp_size/2;
 +
 +    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
 +#pragma unroll 10
 +    for (int j = warp_size_log2 - 1; j > 0; j--)
 +    {
 +        if (tidx < i)
 +        {
 +            buf[               tidx] += buf[               tidx + i];
 +            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 +        }
 +        i >>= 1;
 +    }
 +
 +    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
 +    // memory locations to make sense
 +    /* last reduction step, writing to global mem */
 +    if (tidx == 0)
 +    {
 +        e1 = buf[               tidx] + buf[               tidx + i];
 +        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 +
 +        atomicAdd(e_lj, e1);
 +        atomicAdd(e_el, e2);
 +    }
 +}
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_energy_warp_shfl(float E_lj, float E_el,
 +                             float *e_lj, float *e_el,
 +                             int tidx,
 +                             const unsigned int activemask)
 +{
 +    int i, sh;
 +
 +    /* Tree reduction with shuffle offsets 1, 2, 4, 8, 16 (5 rounds) */
 +    sh = 1;
 +#pragma unroll 5
 +    for (i = 0; i < 5; i++)
 +    {
 +        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
 +        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
 +        sh   += sh;
 +    }
 +
 +    /* The first thread in the warp writes the reduced energies */
 +    if (tidx == 0 || tidx == warp_size)
 +    {
 +        atomicAdd(e_lj, E_lj);
 +        atomicAdd(e_el, E_el);
 +    }
 +}
 +
 +#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
index 74fcc398ac8080973c369dd0727da5c987ff5fbb,0000000000000000000000000000000000000000..c63c10d9571919fe14c3f15d12c9990d4eb6f1a3
mode 100644,000000..100644
--- /dev/null
@@@ -1,913 -1,0 +1,913 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#define GMX_DOUBLE 0
 +
 +#include "gromacs/gpu_utils/device_utils.clh"
 +#include "gromacs/gpu_utils/vectype_ops.clh"
 +#include "gromacs/nbnxm/constants.h"
 +#include "gromacs/pbcutil/ishift.h"
 +
 +#include "nbnxm_ocl_consts.h"
 +
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 +#define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
 +
 +#define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
 +
 +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
 +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
 + * Note that this should precede the kernel_utils include.
 + */
 +#define USE_CJ_PREFETCH 1
 +#else
 +#define USE_CJ_PREFETCH 0
 +#endif
 +
 +#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
 +#define HAVE_SUBGROUP 1
 +#else
 +#define HAVE_SUBGROUP 0
 +#endif
 +
 +#ifdef cl_intel_subgroups
 +#define HAVE_INTEL_SUBGROUP 1
 +#else
 +#define HAVE_INTEL_SUBGROUP 0
 +#endif
 +
 +#if defined _INTEL_SOURCE_
 +#define SUBGROUP_SIZE 8
 +#elif defined _AMD_SOURCE_
 +#define SUBGROUP_SIZE 64
 +#else
 +#define SUBGROUP_SIZE 32
 +#endif
 +
 +#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
 +#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
 +#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
 +
 +/* 1.0 / sqrt(M_PI) */
 +#define M_FLOAT_1_SQRTPI 0.564189583547756f
 +
 +//-------------------
 +
 +#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
 +#define NBNXN_OPENCL_KERNEL_UTILS_CLH
 +
 +#if CL_SIZE == 8
 +#define WARP_SIZE_LOG2  (5)
 +#define CL_SIZE_LOG2    (3)
 +#elif CL_SIZE == 4
 +#define WARP_SIZE_LOG2  (3)
 +#define CL_SIZE_LOG2    (2)
 +#else
 +#error unsupported CL_SIZE
 +#endif
 +
 +#define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
 +#define FBUF_STRIDE     (CL_SIZE_SQ)
 +
 +#define ONE_SIXTH_F     0.16666667f
 +#define ONE_TWELVETH_F  0.08333333f
 +
 +
 +#ifdef __GNUC__
 +/* GCC, clang, and some ICC pretending to be GCC */
 +#  define gmx_unused __attribute__ ((unused))
 +#else
 +#  define gmx_unused
 +#endif
 +
 +// Data structures shared between OpenCL device code and OpenCL host code
 +// TODO: review, improve
 +// Replaced real by float for now, to avoid including any other header
 +typedef struct {
 +    float c2;
 +    float c3;
 +    float cpot;
 +} shift_consts_t;
 +
 +/* Used with potential switching:
 + * rsw        = max(r - r_switch, 0)
 + * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 + * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 + * force      = force*dsw - potential*sw
 + * potential *= sw
 + */
 +typedef struct {
 +    float c3;
 +    float c4;
 +    float c5;
 +} switch_consts_t;
 +
 +// Data structure shared between the OpenCL device code and OpenCL host code
 +// Must not contain OpenCL objects (buffers)
 +typedef struct cl_nbparam_params
 +{
 +
 +    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 +    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 +
 +    float           epsfac;           /**< charge multiplication factor                      */
 +    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 +    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 +    float           ewald_beta;       /**< Ewald/PME parameter                               */
 +    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
 +    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 +    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 +
 +    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 +
 +    float           rvdw_sq;          /**< VdW cut-off squared                               */
 +    float           rvdw_switch;      /**< VdW switched cut-off                              */
 +    float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 +    float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 +
 +    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 +    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 +    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 +
 +    /* Ewald Coulomb force table data - accessed through texture memory */
 +    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 +}cl_nbparam_params_t;
 +
 +typedef struct {
 +    int sci;            /* i-super-cluster       */
 +    int shift;          /* Shift vector index plus possible flags */
 +    int cj4_ind_start;  /* Start index into cj4  */
 +    int cj4_ind_end;    /* End index into cj4    */
 +} nbnxn_sci_t;
 +
 +typedef struct {
 +    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 +    int          excl_ind; /* Index into the exclusion array for 1 warp   */
 +} nbnxn_im_ei_t;
 +
 +typedef struct {
 +    int           cj[4];   /* The 4 j-clusters                            */
 +    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 +} nbnxn_cj4_t;
 +
 +
 +typedef struct {
 +    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 +                                           * each unsigned has bits for 4*8 i clusters
 +                                           */
 +} nbnxn_excl_t;
 +
 +/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 +__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 +
 +/*! \brief Pre-loads the j-cluster indices of one cj4 into local memory.
 + *
 + * NOTE(review): the bitwise & and | in the predicates below look intentional
 + * (branchless, warning-free evaluation) - confirm before changing to && / ||.
 + */
 +gmx_opencl_inline
 +void preloadCj4Generic(__local int        *sm_cjPreload,
 +                       const __global int *gm_cj,
 +                       int                 tidxi,
 +                       int                 tidxj,
 +                       bool gmx_unused     iMaskCond)
 +{
 +    /* Pre-load cj into shared memory */
 +#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 +    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 +    {
 +        sm_cjPreload[tidxi] = gm_cj[tidxi];
 +    }
 +#else
 +    const int c_clSize                   = CL_SIZE;
 +    const int c_nbnxnGpuClusterpairSplit = 2;
 +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 +    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
 +    {
 +        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 +    }
 +#endif
 +}
 +
 +
 +#if USE_SUBGROUP_PRELOAD
 +/*! \brief Pre-loads cj indices using an Intel subgroup block read. */
 +gmx_opencl_inline
 +int  preloadCj4Subgroup(const __global int *gm_cj)
 +{
 +    //loads subgroup-size # of elements (8) instead of the 4 required
 +    //equivalent to *cjs = *gm_cj
 +    return intel_sub_group_block_read((const __global uint *)gm_cj);
 +}
 +#endif //USE_SUBGROUP_PRELOAD
 +
 +/* Storage for pre-loaded cj data: a subgroup register block (Intel path)
 + * or a pointer into local memory (generic path). */
 +#if USE_SUBGROUP_PRELOAD
 +typedef size_t CjType;
 +#else
 +typedef __local int* CjType;
 +#endif
 +
 +/*! \brief Preload cj4
 + *
 + * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 + * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 + * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 + *
 + * It is the caller's responsibility to make sure that data is consumed only when
 + * it's ready. This function does not call a barrier.
 + */
 +gmx_opencl_inline
 +void preloadCj4(CjType gmx_unused             *cjs,
 +                const __global int gmx_unused *gm_cj,
 +                int gmx_unused                 tidxi,
 +                int gmx_unused                 tidxj,
 +                bool gmx_unused                iMaskCond)
 +{
 +#if USE_SUBGROUP_PRELOAD
 +    *cjs = preloadCj4Subgroup(gm_cj);
 +#elif USE_CJ_PREFETCH
 +    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
 +#else
 +    //nothing to do
 +#endif
 +}
 +
 +/*! \brief Returns the pre-loaded cj index for slot jm from local memory,
 + *         offset by the warp/split the calling thread belongs to. */
 +gmx_opencl_inline
 +int loadCjPreload(__local int           *        sm_cjPreload,
 +                  int                            jm,
 +                  int gmx_unused                 tidxi,
 +                  int gmx_unused                 tidxj)
 +{
 +#if defined _AMD_SOURCE_
 +    int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 +#else
 +    const int c_clSize                   = CL_SIZE;
 +    const int c_nbnxnGpuClusterpairSplit = 2;
 +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 +    int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
 +#endif
 +    return sm_cjPreload[jm + warpLoadOffset];
 +}
 +
 +/*! \brief Load a cj given a jm index.
 + *
 + * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 + */
 +gmx_opencl_inline
 +int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
 +           int jm, int gmx_unused tidxi, int gmx_unused tidxj)
 +{
 +#if USE_SUBGROUP_PRELOAD
 +    return sub_group_broadcast(cjs, jm);
 +#elif USE_CJ_PREFETCH
 +    return loadCjPreload(cjs, jm, tidxi, tidxj);
 +#else
 +    return gm_cj[jm];
 +#endif
 +}
 +
 +/*! Convert LJ sigma,epsilon parameters to C6,C12.
 + *  Computes c6 = epsilon*sigma^6 and c12 = c6*sigma^6; assumes sigma and
 + *  epsilon are pre-scaled by the host accordingly - TODO confirm convention.
 + */
 +gmx_opencl_inline
 +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 +                                     const float  epsilon,
 +                                     float       *c6,
 +                                     float       *c12)
 +{
 +    float sigma2, sigma6;
 +
 +    sigma2 = sigma * sigma;
 +    sigma6 = sigma2 *sigma2 * sigma2;
 +    *c6    = epsilon * sigma6;
 +    *c12   = *c6 * sigma6;
 +}
 +
 +
 +/*! Apply force switch, force-only version. */
 +gmx_opencl_inline
 +void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
 +                              float                      c6,
 +                              float                      c12,
 +                              float                      inv_r,
 +                              float                      r2,
 +                              float                     *F_invr)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam->rvdw_switch;
 +    /* clamp: the switch only acts beyond rvdw_switch */
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +}
 +
 +/*! Apply force switch, force + energy version. */
 +gmx_opencl_inline
 +void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
 +                                float                      c6,
 +                                float                      c12,
 +                                float                      inv_r,
 +                                float                      r2,
 +                                float                     *F_invr,
 +                                float                     *E_lj)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
 +
 +    /* Energy-shift constants: the *_F* names are historical; these are the
 +     * integrals of the force-shift terms (V2/3, V3/4). */
 +    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 +    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 +    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 +    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam->rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +    *E_lj    +=
 +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 +}
 +
 +/*! Apply potential switch, force-only version. */
 +gmx_opencl_inline
 +void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
 +                                  float                      inv_r,
 +                                  float                      r2,
 +                                  float                     *F_invr,
 +                                  const float               *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam->vdw_switch.c3;
 +    float switch_V4 = nbparam->vdw_switch.c4;
 +    float switch_V5 = nbparam->vdw_switch.c5;
 +    float switch_F2 = nbparam->vdw_switch.c3;
 +    float switch_F3 = nbparam->vdw_switch.c4;
 +    float switch_F4 = nbparam->vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam->rvdw_switch;
 +
 +    /* Unlike in the F+E kernel, conditional is faster here */
 +    if (r_switch > 0.0f)
 +    {
 +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    }
 +}
 +
 +/*! Apply potential switch, force + energy version. */
 +gmx_opencl_inline
 +void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
 +                                    float                      inv_r,
 +                                    float                      r2,
 +                                    float                     *F_invr,
 +                                    float                     *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam->vdw_switch.c3;
 +    float switch_V4 = nbparam->vdw_switch.c4;
 +    float switch_V5 = nbparam->vdw_switch.c5;
 +    float switch_F2 = nbparam->vdw_switch.c3;
 +    float switch_F3 = nbparam->vdw_switch.c4;
 +    float switch_F4 = nbparam->vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam->rvdw_switch;
 +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    /* Unlike in the F-only kernel, masking is faster here */
 +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    *E_lj   *= sw;
 +}
 +
 +/*! Calculate LJ-PME grid force contribution with
 + *  geometric combination rule.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
 +                                    int                     typei,
 +                                    int                     typej,
 +                                    float                   r2,
 +                                    float                   inv_r2,
 +                                    float                   lje_coeff2,
 +                                    float                   lje_coeff6_6,
 +                                    float                  *F_invr)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +
 +    /* geometric combination: C6_grid = sqrt(C6_i*C6_i) * sqrt(C6_j*C6_j) */
 +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +}
 +
 +/*! Calculate LJ-PME grid force + energy contribution with
 + *  geometric combination rule.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_geom_F_E(__constant const float    *nbfp_comb_climg2d,
 +                                      const cl_nbparam_params_t *nbparam,
 +                                      int                        typei,
 +                                      int                        typej,
 +                                      float                      r2,
 +                                      float                      inv_r2,
 +                                      float                      lje_coeff2,
 +                                      float                      lje_coeff6_6,
 +                                      float                      int_bit,
 +                                      float                     *F_invr,
 +                                      float                     *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 +
 +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    /* Shift should be applied only to real LJ pairs */
 +    sh_mask   = nbparam->sh_lj_ewald*int_bit;
 +    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +}
 +
 +/*! Calculate LJ-PME grid force + energy contribution (energy only if
 + *  with_E_lj is true) with Lorentz-Berthelot combination rule.
 + *  We use a single F+E kernel with conditional because the performance impact
 + *  of this is pretty small and LB on the CPU is anyway very slow.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_LB_F_E(__constant const float    *nbfp_comb_climg2d,
 +                                    const cl_nbparam_params_t *nbparam,
 +                                    int                        typei,
 +                                    int                        typej,
 +                                    float                      r2,
 +                                    float                      inv_r2,
 +                                    float                      lje_coeff2,
 +                                    float                      lje_coeff6_6,
 +                                    float                      int_bit,
 +                                    bool                       with_E_lj,
 +                                    float                     *F_invr,
 +                                    float                     *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +    float sigma, sigma2, epsilon;
 +
 +    /* sigma and epsilon are scaled to give 6*C6 */
 +    sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 +
 +    epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 +
 +    sigma2  = sigma*sigma;
 +    c6grid  = epsilon*sigma2*sigma2*sigma2;
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    if (with_E_lj)
 +    {
 +        float sh_mask;
 +
 +        /* Shift should be applied only to real LJ pairs */
 +        sh_mask   = nbparam->sh_lj_ewald*int_bit;
 +        *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +    }
 +}
 +
 +/*! Interpolate Ewald coulomb force linearly from the coulomb_tab_climg2d table.
 + *  Original idea: from the OpenMM project
 + */
 +gmx_opencl_inline float
 +interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
 +                            float                   r,
 +                            float                   scale)
 +{
 +    float   normalized = scale * r;
 +    int     index      = (int) normalized;
 +    /* linear interpolation weights between table entries index and index+1 */
 +    float   fract2     = normalized - index;
 +    float   fract1     = 1.0f - fract2;
 +
 +    return fract1*coulomb_tab_climg2d[index] +
 +           fract2*coulomb_tab_climg2d[index + 1];
 +}
 +
 +/*! Calculate analytical Ewald correction term.
 + *  Evaluated as a rational polynomial approximation: numerator FN6..FN0,
 + *  denominator FD4..FD0, both in powers of z2.
 + */
 +gmx_opencl_inline
 +float pmecorrF(float z2)
 +{
 +    const float FN6 = -1.7357322914161492954e-8f;
 +    const float FN5 = 1.4703624142580877519e-6f;
 +    const float FN4 = -0.000053401640219807709149f;
 +    const float FN3 = 0.0010054721316683106153f;
 +    const float FN2 = -0.019278317264888380590f;
 +    const float FN1 = 0.069670166153766424023f;
 +    const float FN0 = -0.75225204789749321333f;
 +
 +    const float FD4 = 0.0011193462567257629232f;
 +    const float FD3 = 0.014866955030185295499f;
 +    const float FD2 = 0.11583842382862377919f;
 +    const float FD1 = 0.50736591960530292870f;
 +    const float FD0 = 1.0f;
 +
 +    float       z4;
 +    float       polyFN0, polyFN1, polyFD0, polyFD1;
 +
 +    z4          = z2*z2;
 +
 +    /* Estrin-style evaluation: even/odd power chains combined at the end */
 +    polyFD0     = FD4*z4 + FD2;
 +    polyFD1     = FD3*z4 + FD1;
 +    polyFD0     = polyFD0*z4 + FD0;
 +    polyFD0     = polyFD1*z2 + polyFD0;
 +
 +    polyFD0     = 1.0f/polyFD0;
 +
 +    polyFN0     = FN6*z4 + FN4;
 +    polyFN1     = FN5*z4 + FN3;
 +    polyFN0     = polyFN0*z4 + FN2;
 +    polyFN1     = polyFN1*z4 + FN1;
 +    polyFN0     = polyFN0*z4 + FN0;
 +    polyFN0     = polyFN1*z2 + polyFN0;
 +
 +    return polyFN0*polyFD0;
 +}
 +
 +#if REDUCE_SHUFFLE
 +/*! \brief j-force reduction using Intel subgroup shuffles; threads with
 + *         tidxi == 0,1,2 accumulate x,y,z into global memory. */
 +gmx_opencl_inline
 +void reduce_force_j_shfl(float3 fin, __global float *fout,
 +                         int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
 +{
 +    /* Only does reduction over 4 elements in cluster. Needs to be changed
 +     * for CL_SIZE>4. See CUDA code for required code */
 +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
 +    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
 +    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
 +    if ((tidxi & 1) == 1)
 +    {
 +        fin.x = fin.y;
 +    }
 +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
 +    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
 +    if (tidxi == 2)
 +    {
 +        fin.x = fin.z;
 +    }
 +    if (tidxi < 3)
 +    {
 +        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
 +    }
 +}
 +#endif
 +
 +/*! \brief Generic j-force reduction: stages the force components in local
 + *         memory, then the first 3 column threads reduce one component each. */
 +gmx_opencl_inline
 +void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    int tidx = tidxi + tidxj*CL_SIZE;
 +    f_buf[                  tidx] = fcj_buf.x;
 +    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
 +    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 +
 +    /* Split the reduction between the first 3 column threads
 +       Threads with column id 0 will do the reduction for (float3).x components
 +       Threads with column id 1 will do the reduction for (float3).y components
 +       Threads with column id 2 will do the reduction for (float3).z components.
 +       The reduction is performed for each line tidxj of f_buf. */
 +    if (tidxi < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 +        {
 +            f += f_buf[FBUF_STRIDE * tidxi + j];
 +        }
 +
 +        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 +    }
 +}
 +
 +/*! Final j-force reduction; dispatches to the subgroup-shuffle or the
 + *  local-memory implementation depending on REDUCE_SHUFFLE.
 + */
 +gmx_opencl_inline
 +void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
 +                    int tidxi, int tidxj, int aidx)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
 +#else
 +    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
 +#endif
 +}
 +
 +#if REDUCE_SHUFFLE
 +/*! \brief i-force (and optional shift-force) reduction using Intel subgroup
 + *         shuffles, accumulating all NCL_PER_SUPERCL i-clusters. */
 +gmx_opencl_inline
 +void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
 +                                   bool bCalcFshift, int tidxi, int tidxj,
 +                                   int sci, int shift, __global float *fshift)
 +{
 +    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
 +     * for CL_SIZE>4.*/
 +    float2 fshift_buf = 0;
 +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 +        float3 fin  = fci_buf[ci_offset];
 +        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
 +        fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
 +        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
 +
 +        /* After the first shuffle round, odd rows carry the reduced y in fin.x */
 +        if (tidxj & 1)
 +        {
 +            fin.x = fin.y;
 +        }
 +        /* Threads 0,1 and 2,3 increment x,y for their warp */
 +        atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
 +        if (bCalcFshift)
 +        {
 +            fshift_buf[0] += fin.x;
 +        }
 +        /* Threads 0 and 2 increment z for their warp */
 +        if ((tidxj & 1) == 0)
 +        {
 +            atomicAdd_g_f(&fout[3*aidx+2], fin.z);
 +            if (bCalcFshift)
 +            {
 +                fshift_buf[1] += fin.z;
 +            }
 +        }
 +    }
 +    /* add up local shift forces into global mem */
 +    if (bCalcFshift)
 +    {
 +        //Threads 0,1 and 2,3 update x,y
 +        atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
 +        //Threads 0 and 2 update z
 +        if ((tidxj & 1) == 0)
 +        {
 +            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
 +        }
 +    }
 +}
 +#endif
 +
 +/*! Final i-force reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +gmx_opencl_inline
 +void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
 +                                   __global float *fout,
 +                                   bool bCalcFshift,
 +                                   int tidxi, int tidxj,
 +                                   int sci, int shift, __global float *fshift)
 +{
 +    float fshift_buf = 0;
 +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 +        int tidx = tidxi + tidxj*CL_SIZE;
 +        /* store i forces in shmem */
 +        f_buf[                  tidx] = fci_buf[ci_offset].x;
 +        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
 +        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
 +        barrier(CLK_LOCAL_MEM_FENCE);
 +
 +        int     i, j;
 +        /* Reduce the initial CL_SIZE values for each i atom to half
 +         * every step by using CL_SIZE * i threads.
 +         * Can't just use i as loop variable because than nvcc refuses to unroll.
 +         */
 +        i = CL_SIZE/2;
 +        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 +        {
 +            if (tidxj < i)
 +            {
 +
 +                f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 +                f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 +                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 +            }
 +            i >>= 1;
 +        }
 +        /* needed because
 +         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 +         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 +         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
 +         */
 +        barrier(CLK_LOCAL_MEM_FENCE);
 +
 +        /* i == 1, last reduction step, writing to global mem */
 +        /* Split the reduction between the first 3 line threads
 +           Threads with line id 0 will do the reduction for (float3).x components
 +           Threads with line id 1 will do the reduction for (float3).y components
 +           Threads with line id 2 will do the reduction for (float3).z components. */
 +        if (tidxj < 3)
 +        {
 +            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 +
 +            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 +
 +            if (bCalcFshift)
 +            {
 +                fshift_buf += f;
 +            }
 +        }
 +    }
 +    /* add up local shift forces into global mem */
 +    if (bCalcFshift)
 +    {
 +        /* Only threads with tidxj < 3 will update fshift.
 +           The threads performing the update, must be the same as the threads
 +           storing the reduction result above.
 +         */
 +        if (tidxj < 3)
 +        {
 +            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
 +        }
 +    }
 +}
 +
 +/*! Final i-force reduction
 + */
 +gmx_opencl_inline
 +void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
 +                              bool bCalcFshift, int tidxi, int tidxj, int sci,
 +                              int shift, __global float *fshift)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
 +                                  sci, shift, fshift);
 +#else
 +    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
 +                                  sci, shift, fshift);
 +#endif
 +}
 +
 +
 +
 +#if REDUCE_SHUFFLE
 +gmx_opencl_inline
 +void reduce_energy_shfl(float E_lj, float E_el,
 +                        volatile __global float *e_lj,
 +                        volatile __global float *e_el,
 +                        unsigned int    tidx)
 +{
 +    E_lj = sub_group_reduce_add(E_lj);
 +    E_el = sub_group_reduce_add(E_el);
 +    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
 +     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
 +     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
 +     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
 +     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
 +    if (tidx == 0 || tidx == WARP_SIZE)
 +    {
 +        atomicAdd_g_f(e_lj, E_lj);
 +        atomicAdd_g_f(e_el, E_el);
 +    }
 +}
 +#endif
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +gmx_opencl_inline
 +void reduce_energy_pow2(volatile __local float  *buf,
 +                        volatile __global float *e_lj,
 +                        volatile __global float *e_el,
 +                        unsigned int             tidx)
 +{
 +    int          j;
 +
 +    unsigned int i = WARP_SIZE/2;
 +
 +    /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 +    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 +    {
 +        if (tidx < i)
 +        {
 +            buf[              tidx] += buf[              tidx + i];
 +            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 +        }
 +        i >>= 1;
 +    }
 +
 +    /* last reduction step, writing to global mem */
 +    if (tidx == 0)
 +    {
 +        float e1 = buf[              tidx] + buf[              tidx + i];
 +        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 +
 +        atomicAdd_g_f(e_lj, e1);
 +        atomicAdd_g_f(e_el, e2);
 +    }
 +}
 +
 +gmx_opencl_inline
 +void reduce_energy(volatile __local float  gmx_unused *buf,
 +                   float E_lj, float E_el,
 +                   volatile __global float *e_lj,
 +                   volatile __global float *e_el,
 +                   unsigned int             tidx)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
 +#else
 +    /* flush the energies to shmem and reduce them */
 +    buf[              tidx] = E_lj;
 +    buf[FBUF_STRIDE + tidx] = E_el;
 +    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 +#endif
 +}
 +
 +gmx_opencl_inline
 +bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
 +{
 +    if (pred)
 +    {
 +        warp_any[widx] = 1;
 +    }
 +
 +    bool ret = warp_any[widx];
 +
 +    warp_any[widx] = 0;
 +
 +    return ret;
 +}
 +
 +//! Returns a true if predicate is true for any work item in warp
 +gmx_opencl_inline
 +bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
 +{
 +#if USE_SUBGROUP_ANY
 +    return sub_group_any(pred);
 +#else
 +    return gmx_sub_group_any_localmem(warp_any, widx, pred);
 +#endif
 +}
 +
 +#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
index 5e18aa79483706da3f697d17ad281ca5972a0b3e,b1f2a547df87634928d19257ffd7656bd39cc5a3..09f4ebc27cf8e7f01587394f964da4be470f13c1
  #include <algorithm>
  #include <string>
  
+ #include "gromacs/ewald/pme.h"
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/detecthardware.h"
  #include "gromacs/hardware/hardwaretopology.h"
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
 -#include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
 +#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/taskassignment/taskassignment.h"
  #include "gromacs/topology/topology.h"
  #include "gromacs/utility/baseversion.h"
@@@ -153,14 -154,18 +154,18 @@@ decideWhetherToUseGpusForPmeWithThreadM
                                            const TaskTarget        pmeTarget,
                                            const std::vector<int> &gpuIdsToUse,
                                            const std::vector<int> &userGpuTaskAssignment,
-                                           const bool              canUseGpuForPme,
+                                           const gmx_hw_info_t    &hardwareInfo,
+                                           const t_inputrec       &inputrec,
+                                           const gmx_mtop_t       &mtop,
                                            const int               numRanksPerSimulation,
                                            const int               numPmeRanksPerSimulation)
  {
      // First, exclude all cases where we can't run PME on GPUs.
      if ((pmeTarget == TaskTarget::Cpu) ||
          !useGpuForNonbonded ||
-         !canUseGpuForPme)
+         !pme_gpu_supports_build(nullptr) ||
+         !pme_gpu_supports_hardware(hardwareInfo, nullptr) ||
+         !pme_gpu_supports_input(inputrec, mtop, nullptr))
      {
          // PME can't run on a GPU. If the user required that, we issue
          // an error later.
@@@ -335,7 -340,9 +340,9 @@@ bool decideWhetherToUseGpusForNonbonded
  bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                    const TaskTarget        pmeTarget,
                                    const std::vector<int> &userGpuTaskAssignment,
-                                   const bool              canUseGpuForPme,
+                                   const gmx_hw_info_t    &hardwareInfo,
+                                   const t_inputrec       &inputrec,
+                                   const gmx_mtop_t       &mtop,
                                    const int               numRanksPerSimulation,
                                    const int               numPmeRanksPerSimulation,
                                    const bool              gpusWereDetected)
          if (pmeTarget == TaskTarget::Gpu)
          {
              GMX_THROW(NotImplementedError
-                           ("The PME on the GPU is only supported when nonbonded interactions run on GPUs also."));
+                           ("PME on GPUs is only supported when nonbonded interactions run on GPUs also."));
          }
          return false;
      }
  
-     if (!canUseGpuForPme)
+     std::string message;
+     if (!pme_gpu_supports_build(&message))
      {
          if (pmeTarget == TaskTarget::Gpu)
          {
-             // TODO Pass in the inputrec so we can give more help here?
              GMX_THROW(NotImplementedError
-                           ("The input simulation did not use PME in a way that is supported on the GPU."));
+                           ("Cannot compute PME interactions on a GPU, because " + message));
+         }
+         return false;
+     }
+     if (!pme_gpu_supports_hardware(hardwareInfo, &message))
+     {
+         if (pmeTarget == TaskTarget::Gpu)
+         {
+             GMX_THROW(NotImplementedError
+                           ("Cannot compute PME interactions on a GPU, because " + message));
+         }
+         return false;
+     }
+     if (!pme_gpu_supports_input(inputrec, mtop, &message))
+     {
+         if (pmeTarget == TaskTarget::Gpu)
+         {
+             GMX_THROW(NotImplementedError
+                           ("Cannot compute PME interactions on a GPU, because " + message));
          }
          return false;
      }
          // Specifying -gputasks requires specifying everything.
          if (pmeTarget == TaskTarget::Auto)
          {
-             GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi")));
+             GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi"))); // TODO ntmpi?
          }
  
          return true;
index da85789f5f0ff21fb1dceb6299907481c77a525e,09d02464a6f4f45abf7ee9fe5ad1671e4b80fe0e..05e3912797ddb62d9bee0abfc3f9aa66e919faae
@@@ -356,7 -356,9 +356,9 @@@ int get_nthreads_mpi(const gmx_hw_info_
      if (pmeOnGpu)
      {
          GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) &&
-                            pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, *mtop, nullptr),
+                            pme_gpu_supports_build(nullptr) &&
+                            pme_gpu_supports_hardware(*hwinfo, nullptr) &&
+                            pme_gpu_supports_input(*inputrec, *mtop, nullptr),
                             "PME can't be on GPUs unless we are using PME");
  
          // PME on GPUs supports a single PME rank with PP running on the same or few other ranks.
  
      /* nonbondedOnGpu might be false e.g. because this simulation uses
       * the group scheme, or is a rerun with energy groups. */
 -    ngpu = (nonbondedOnGpu ? static_cast<int>(gpuIdsToUse.size()) : 0);
 +    ngpu = (nonbondedOnGpu ? gmx::ssize(gpuIdsToUse) : 0);
  
      if (inputrec->cutoff_scheme == ecutsGROUP)
      {
index c2e8eb2dc39b239267ea78c79ab7dbfb749d2b5a,a4db1c3a55d580b44973f3d833c573c70c3d7497..15f100aa9f11f87b357d07fd232c119b433b9d2c
@@@ -43,8 -43,7 +43,8 @@@
  
  #include "topologyinformation.h"
  
 -#include "gromacs/compat/make_unique.h"
 +#include <memory>
 +
  #include "gromacs/fileio/confio.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/pbcutil/rmpbc.h"
@@@ -60,8 -59,7 +60,7 @@@ namespace gm
  {
  
  TopologyInformation::TopologyInformation()
-     : mtop_(std::make_unique<gmx_mtop_t>()),
-       hasLoadedMtop_(false),
+     : hasLoadedMtop_(false),
        expandedTopology_(nullptr),
        atoms_ (nullptr),
        bTop_(false), ePBC_(-1)
@@@ -75,7 -73,7 +74,7 @@@ TopologyInformation::~TopologyInformati
  
  void TopologyInformation::fillFromInputFile(const std::string &filename)
  {
 -    mtop_ = gmx::compat::make_unique<gmx_mtop_t>();
 +    mtop_ = std::make_unique<gmx_mtop_t>();
      // TODO When filename is not a .tpr, then using readConfAndAtoms
      // would be efficient for not doing multiple conversions for
      // makeAtomsData. However we'd also need to be able to copy the
@@@ -108,8 -106,7 +107,8 @@@ const gmx_localtop_t *TopologyInformati
      // Do lazy initialization
      if (expandedTopology_ == nullptr && hasTopology())
      {
 -        expandedTopology_.reset(gmx_mtop_generate_local_top(mtop_.get(), false));
 +        expandedTopology_ = std::make_unique<gmx_localtop_t>();
 +        gmx_mtop_generate_local_top(*mtop_, expandedTopology_.get(), false);
      }
  
      return expandedTopology_.get();