Merge branch 'release-2016'

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 11 Sep 2017 17:33:09 +0000 (19:33 +0200)

committer Berk Hess <hess@kth.se>

Tue, 12 Sep 2017 06:47:39 +0000 (08:47 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 11 Sep 2017 17:33:09 +0000 (19:33 +0200)
committer Berk Hess <hess@kth.se>
Tue, 12 Sep 2017 06:47:39 +0000 (08:47 +0200)
diff --combined CMakeLists.txt

index 4c264ff2a0dc74b33f5478c5985e203a594f5a0d,52634bf9b89732faafceaaf2399679e3e035bea7..3aa958f4b1eb8bf2fb8918f8d58b9c05e7d059d9
--- 1/CMakeLists.txt
--- 2/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -32,7 -32,11 +32,7 @@@
   # To help us fund GROMACS development, we humbly ask that you cite
   # the research papers on the package. Check out http://www.gromacs.org.
   
- -cmake_minimum_required(VERSION 2.8.8)
- -# When we require cmake >= 2.8.12, it will provide
- -# CMAKE_MINIMUM_REQUIRED_VERSION automatically, but in the meantime we
- -# need to set a variable, and it must have a different name.
- -set(GMX_CMAKE_MINIMUM_REQUIRED_VERSION "2.8.8")
+ +cmake_minimum_required(VERSION 3.4.3)
   
   # CMake modules/macros are in a subdirectory to keep this file cleaner
   # This needs to be set before project() in order to pick up toolchain files
@@@ -66,10 -70,11 +66,10 @@@ include(gmxBuildTypeMSAN
   include(gmxBuildTypeReleaseWithAssert)
   
   if(NOT CMAKE_BUILD_TYPE)
- -    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert Profile." FORCE)
- -    # There's no need to offer a user the choice of ThreadSanitizer
+ +    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel Reference RelWithAssert Profile TSAN ASAN MSAN." FORCE)
       # Set the possible values of build type for cmake-gui
       set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
- -        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert" "Profile")
+ +        "MinSizeRel" "RelWithDebInfo" "Reference" "RelWithAssert" "Profile" "TSAN" "ASAN" "MSAN")
   endif()
   if(CMAKE_CONFIGURATION_TYPES)
       # Add appropriate GROMACS-specific build types for the Visual
@@@ -85,8 -90,14 +85,8 @@@ set(build_types_with_explicit_flags REL
   
   set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
   
- -# Set a default valgrind suppression file.
- -# This unfortunately needs to duplicate information from CTest to work as
- -# expected...
- -set(MEMORYCHECK_SUPPRESSIONS_FILE
- -    "${CMAKE_SOURCE_DIR}/cmake/legacy_and_external.supp"
- -    CACHE FILEPATH
- -    "File that contains suppressions for the memory checker")
- -include(CTest)
+ +include(gmxCTestUtilities)
+ +gmx_ctest_init()
   
   include(gmxCPackUtilities)
   gmx_cpack_init()
@@@ -95,21 -106,6 +95,21 @@@
   set(INSTALLED_HEADER_INCLUDE_DIRS "")
   set(INSTALLED_HEADER_DEFINITIONS "")
   
+ +########################################################################
+ +# Global non-cache variables for implementing the build system
+ +########################################################################
+ +
+ +# These variables collect libraries that GROMACS requires for
+ +# linking. They should be appended to with list(APPEND ${name}
+ +# new-library) calls. They are:
+ +#  - Libraries that are required for libgromacs (only)
+ +set(GMX_EXTRA_LIBRARIES "")
+ +#  - Libraries that are required for all code in the repository
+ +set(GMX_COMMON_LIBRARIES "")
+ +#  - Libraries that all code linked against libgromacs needs
+ +#    (i.e., something that is exposed in installed headers).
+ +set(GMX_PUBLIC_LIBRARIES "")
+ +
   ########################################################################
   # Check and warn if cache generated on a different host is being reused
   ########################################################################
@@@ -201,16 -197,12 +201,16 @@@ gmx_add_cache_dependency(GMX_COOL_QUOTE
   
   option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
   
- -# Decide on GPU settings based on user-settings and GPU/CUDA detection.
- -# GCC 4.6 requires CUDA 5.0 and VS2015 requires CUDA 8.0
+ +# Decide on GPU settings based on user-settings and GPU/CUDA
+ +# detection.  GCC 4.8 requires CUDA 6.0 (but we choose 6.5 for the
+ +# preliminary C++11 support), icc 15 requires CUDA 7.0, and VS2015
+ +# requires CUDA 8.0
   if(MSVC)
       set(REQUIRED_CUDA_VERSION 8.0)
+ +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+ +    set(REQUIRED_CUDA_VERSION 7.0)
   else()
- -    set(REQUIRED_CUDA_VERSION 5.0)
+ +    set(REQUIRED_CUDA_VERSION 6.5)
   endif()
   set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
   
@@@ -236,11 -228,14 +236,11 @@@ else(
       endif()
   endif()
   
- -include(gmxDetectSimd)
- -gmx_detect_simd(GMX_SUGGESTED_SIMD)
- -
   gmx_option_multichoice(
       GMX_SIMD
       "SIMD instruction set for CPU kernels and compiler optimization"
- -    "${GMX_SUGGESTED_SIMD}"
- -    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX_512 AVX_512_KNL MIC ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
+ +    "AUTO"
+ +    AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX2_128 AVX_512 AVX_512_KNL MIC ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
   
   if(GMX_TARGET_MIC)
       set(GMX_FFT_LIBRARY_DEFAULT "mkl")
@@@ -333,12 -328,28 +333,17 @@@ gmx_c_flags(
   
   # This variable should be used for additional compiler flags which are not
   # generated in gmxCFlags nor are SIMD or MPI related.
+ #
+ # TODO These variables should be consolidated into
+ # EXTRA_COMPILER_FLAGS so that we we don't perpetrate bugs where
+ # things that work in C compilation (e.g. merging from old branches)
+ # also work for C++ compilation.
   set(EXTRA_C_FLAGS "")
   set(EXTRA_CXX_FLAGS "")
   
   # Run through a number of tests for buggy compilers and other issues
   include(gmxTestCompilerProblems)
   gmx_test_compiler_problems()
- -# GMX_SIMD will not be set automatically until the second
- -# pass (which is not strictly guaranteed to occur), so putting this
- -# check here among logically-related tests is inefficient, but the
- -# potential loss is likely zero.
- -if(GMX_SIMD STREQUAL "AVX_256"
- -        AND CMAKE_COMPILER_IS_GNUCC
- -        AND (C_COMPILER_VERSION VERSION_EQUAL "4.6.1"
- -            OR CXX_COMPILER_VERSION VERSION_EQUAL "4.6.1"))
- -    message(FATAL_ERROR "gcc 4.6.1 has buggy support for AVX, and GROMACS mdrun will not work. If you want simulation performance, use a more recent compiler. Otherwise, use GMX_SIMD=SSE4.1")
- -    # See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49002
- -endif()
   
   # Implement double-precision option. This is complicated because we
   # need installed headers to use the precision mode of the build that
@@@ -458,6 -469,11 +463,6 @@@ include(gmxManageSharedLibraries
   # Find external packages                                               #
   ########################################################################
   
- -# TNG wants zlib if it is available. And static libxml2 might have a dependency
- -find_package(ZLIB QUIET)
- -include(gmxTestZLib)
- -gmx_test_zlib(HAVE_ZLIB)
- -
   # Unconditionally find the package, as it is also required for unit
   # tests. This exports LIBXML2_FOUND, which we should not use because
   # it does not tell us that linking will succeed. Instead, we test that
@@@ -503,7 -519,7 +508,7 @@@ endif(
   option(GMX_HWLOC "Add support for hwloc Portable Hardware locality library" ${GMX_HWLOC_DEFAULT})
   if(GMX_HWLOC)
       if(HWLOC_FOUND)
- -        include_directories(${HWLOC_INCLUDE_DIRS})
+ +        include_directories(SYSTEM ${HWLOC_INCLUDE_DIRS})
           list(APPEND GMX_EXTRA_LIBRARIES ${HWLOC_LIBRARIES})
       else()
           message(FATAL_ERROR "Hwloc package support requested, but not found.")
@@@ -562,6 -578,7 +567,6 @@@ if (TMPI_ATOMICS_DISABLED
      add_definitions(-DTMPI_ATOMICS_DISABLED)
   endif()
   
- -# Note this relies on zlib detection having already run
   include(gmxManageTNG)
   
   include(gmxManageLmfit)
@@@ -569,6 -586,8 +574,6 @@@
   if(GMX_GPU)
       # now that we have detected the dependencies, do the second configure pass
       gmx_gpu_setup()
- -else()
- -    mark_as_advanced(CUDA_HOST_COMPILER)
   endif()
   
   if(CYGWIN)
@@@ -601,9 -620,7 +606,9 @@@ gmx_add_cache_dependency(GMX_BUILD_UNIT
   
   add_definitions( -DHAVE_CONFIG_H )
   include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src)
- -include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/include)
+ +# TODO required at high level because both libgromacs and progs/mdrun
+ +# require it, both for thread-MPI and its atomics and mutexes.
+ +include_directories(BEFORE SYSTEM ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/include)
   # Required for config.h, maybe should only be set in src/CMakeLists.txt
   include_directories(BEFORE ${CMAKE_BINARY_DIR}/src)
   
@@@ -612,50 -629,24 +617,50 @@@ gmx_test_inline_asm_gcc_x86(GMX_X86_GCC
   
   include(gmxSetBuildInformation)
   gmx_set_build_information()
- -# Turn on RDTSCP if:
- -# - the build system's CPU supports it
- -# - the acceleration is set to AVX as all AVX-capable CPUs support AVX (which
- -#   at this point means that the user set it).
- -# Note: it's better to not use the later set value of GMX_SIMD because
- -# it reflects the system's capability of both compiling and running AVX code.
- -# TODO: After merge with 5.0 one could implement a cache variable dependency
- -# such that GMX_USE_RDTSCP can change if GMX_SIMD is changed to AVX
- -# after the first cmake pass.
- -if (BUILD_CPU_FEATURES MATCHES "rdtscp" OR GMX_SIMD MATCHES "AVX")
- -    set(GMX_USE_RDTSCP_DEFAULT_VALUE ON)
- -else()
- -    set(GMX_USE_RDTSCP_DEFAULT_VALUE OFF)
- -endif()
- -option(GMX_USE_RDTSCP "Use RDTSCP for better CPU-based timers (available on recent x86 CPUs; might need to be off when compiling for heterogeneous environments)" ${GMX_USE_RDTSCP_DEFAULT_VALUE})
+ +
+ +gmx_option_multichoice(
+ +    GMX_USE_RDTSCP
+ +    "Use low-latency RDTSCP instruction for CPU-based timers for mdrun execution; might need to be off when compiling for heterogeneous environments)"
+ +    "AUTO"
+ +    OFF ON AUTO DETECT)
   mark_as_advanced(GMX_USE_RDTSCP)
- -if(GMX_USE_RDTSCP)
+ +
+ +macro(gmx_check_rdtscp)
+ +    if (CPU_DETECTION_FEATURES MATCHES "rdtscp")
+ +        set(HAVE_RDTSCP 1)
+ +        set(RDTSCP_DETECTION_MESSAGE " - detected on the build host")
+ +    else()
+ +        set(RDTSCP_DETECTION_MESSAGE " - not detected on the build host")
+ +    endif()
+ +endmacro()
+ +
+ +set(HAVE_RDTSCP 0)
+ +if (GMX_USE_RDTSCP STREQUAL "ON")
       set(HAVE_RDTSCP 1)
+ +elseif(GMX_USE_RDTSCP STREQUAL "DETECT")
+ +    gmx_check_rdtscp()
+ +elseif(GMX_USE_RDTSCP STREQUAL "AUTO")
+ +    # If the user specified automated SIMD selection, that the choice
+ +    # is made based on detection on the build host. If so, then RDTSCP
+ +    # should be chosen the same way.
+ +    #
+ +    # If the user specified an AVX SIMD level (e.g. when
+ +    # cross-compiling GROMACS) then they will get our best guess, ie
+ +    # that in practice AVX mostly correlates with rdtscp (and anyway
+ +    # is only relevant in rather old x86 hardware).
+ +    if (GMX_SIMD STREQUAL "AUTO")
+ +        gmx_check_rdtscp()
+ +    elseif (GMX_SIMD MATCHES "AVX")
+ +        set(HAVE_RDTSCP 1)
+ +    endif()
+ +endif()
+ +gmx_check_if_changed(HAVE_RDTSCP_CHANGED HAVE_RDTSCP)
+ +if (HAVE_RDTSCP_CHANGED)
+ +    if (HAVE_RDTSCP)
+ +        message(STATUS "Enabling RDTSCP support${RDTSCP_DETECTION_MESSAGE}")
+ +    else()
+ +        message(STATUS "Disabling RDTSCP support${RDTSCP_DETECTION_MESSAGE}")
+ +    endif()
   endif()
   
   include(gmxTestLargeFiles)
@@@ -729,8 -720,8 +734,8 @@@ if(HAVE_TIME_H AND HAVE_UNISTD_H AND HA
   endif()
   
   # Math and thread libraries must often come after all others when linking...
- -if(HAVE_LIBM)
- -    list(APPEND GMX_EXTRA_LIBRARIES m)
+ +if (HAVE_LIBM)
+ +    list(APPEND GMX_PUBLIC_LIBRARIES m)
   endif()
   
   option(GMX_NACL "Configure for Native Client builds" OFF)
@@@ -780,7 -771,7 +785,7 @@@ else(
       message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
       message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
       message("CMAKE_EXE_LINKER_FLAGS: ${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS}")
- -    message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
+ +    message("CMAKE_SHARED_LINKER_FLAGS: ${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS}")
   endif()
   
   if(NOT GMX_OPENMP)
@@@ -830,12 -821,17 +835,12 @@@ include(gmxManageSuffixes
   ################################################################
   # Shared library load path settings
   ################################################################
- -# CMake supports RPATH on OS X only from 2.8.12 upwards.
- -# CMAKE_SYSTEM_VERSION > 8.0 matches OS X 10.5 and above, where RPATH support
- -# was added.
- -
   if(NOT GMX_BUILD_SHARED_EXE)
       # No rpath
       set(CMAKE_SKIP_RPATH TRUE)
       set(CMAKE_EXE_LINK_DYNAMIC_C_FLAGS) # remove -Wl,-Bdynamic
       set(CMAKE_EXE_LINK_DYNAMIC_CXX_FLAGS)
- -elseif((NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") OR
- -   ((CMAKE_SYSTEM_VERSION VERSION_GREATER 8.0) AND (NOT CMAKE_VERSION VERSION_LESS 2.8.12)))
+ +else()
       # The build folder always has bin/ and lib/; if we are also going to
       # install to lib/, then the installation RPATH works also in the build
       # tree.  This makes installation slightly faster (no need to rewrite the
@@@ -845,13 -841,23 +850,13 @@@
       endif()
       # Set the RPATH as relative to the executable location to make the
       # binaries relocatable.
- -    if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ +    if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") #Assume OS X >=10.5
           set(CMAKE_INSTALL_RPATH "\$ORIGIN/../${GMX_LIB_INSTALL_DIR}")
       else()
           set(CMAKE_INSTALL_RPATH "@executable_path/../${GMX_LIB_INSTALL_DIR}")
       endif()
       set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
       set(CMAKE_MACOSX_RPATH 1)
- -else()
- -    # We are on Darwin/OSX, and CMake cannot handle RPATHs automatically.
- -    if(CMAKE_SYSTEM_VERSION VERSION_GREATER 8.0)
- -        # Set the RPATH options manually.
- -        set(CMAKE_INSTALL_NAME_DIR "@rpath")
- -        set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} "-Wl,-rpath,@executable_path/../${GMX_LIB_INSTALL_DIR}")
- -    else()
- -        # Use the old INSTALL_NAME_DIR mechanism if RPATH is not supported.
- -        set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}")
- -    endif()
   endif()
   
   #COPYING file: Only necessary for binary distributions.
@@@ -868,7 -874,18 +873,7 @@@ if (GMX_BUILD_FOR_COVERAGE
   endif()
   
   if (BUILD_TESTING)
- -    # "tests" target builds all the separate test binaries.
- -    add_custom_target(tests)
- -    # "run-ctest" is an internal target that actually runs the tests.
- -    # This is necessary to be able to add separate targets that execute as part
- -    # of 'make check', but are ensured to be executed after the actual tests.
- -    add_custom_target(run-ctest
- -                      COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
- -                      COMMENT "Running all tests"
- -                      VERBATIM)
- -    add_dependencies(run-ctest tests)
- -    # "check" target builds and runs all tests.
- -    add_custom_target(check DEPENDS run-ctest)
+ +    include(tests/CheckTarget.cmake)
   endif()
   
   if (NOT GMX_BUILD_MDRUN_ONLY)
diff --combined cmake/gmxManageFFTLibraries.cmake

index f0ef27f6d6422b8d271ba5cdb02ba6babdb8325b,6577ef201c858b46b79493bc41ecfa9478f64080..8a184948cde1b20acf68a6f58cb251b690b2fe6f
--- 1/cmake/gmxManageFFTLibraries.cmake
--- 2/cmake/gmxManageFFTLibraries.cmake
+++ b/cmake/gmxManageFFTLibraries.cmake
@@@ -89,10 -89,10 +89,10 @@@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3"
           set(PKG_FFT "${${FFTW}_PKG}")
           include_directories(SYSTEM ${${FFTW}_INCLUDE_DIRS})
   
- -        if ((${GMX_SIMD} MATCHES "SSE" OR ${GMX_SIMD} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
+ +        if ((${GMX_SIMD_ACTIVE} MATCHES "SSE" OR ${GMX_SIMD_ACTIVE} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
               message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
           else()
- -            if(${GMX_SIMD} MATCHES "AVX" AND NOT (${FFTW}_HAVE_SSE OR ${FFTW}_HAVE_SSE2))
+ +            if(${GMX_SIMD_ACTIVE} MATCHES "AVX" AND NOT (${FFTW}_HAVE_SSE OR ${FFTW}_HAVE_SSE2))
                   # If we end up here we have an AVX Gromacs build, and
                   # FFTW with SIMD.
                   message(WARNING "The FFTW library was compiled with neither --enable-sse nor --enable-sse2; those would have enabled SSE(2) SIMD instructions. This will give suboptimal performance. You should (re)compile the FFTW library with --enable-sse2 and --enable-avx (and --enable-avx2 or --enable-avx512 if supported).")
@@@ -120,6 -120,7 +120,7 @@@ elseif(${GMX_FFT_LIBRARY} STREQUAL "MKL
           # Some versions of icc require this in order that mkl.h can be
           # found at compile time.
           set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FFT_LINKER_FLAGS}")
+         set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} ${FFT_LINKER_FLAGS}")
   
           set(MKL_ERROR_MESSAGE "Make sure you have configured your compiler so that ${FFT_LINKER_FLAGS} will work.")
       else()
diff --combined docs/install-guide/index.rst

index 429d9a7b028edb09bba0ee201068db0bc9f3b1cf,2f8afec88bb6a9c86dacf7d035f7cda04a9c3a47..98b6fa8b074a9eb5a1ac6b55b1b2d82e217bdd77
--- 1/docs/install-guide/index.rst
--- 2/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@@ -15,7 -15,7 +15,7 @@@ These instructions pertain to building 
   Quick and dirty installation
   ----------------------------
   1. Get the latest version of your C and C++ compilers.
- -2. Check that you have CMake version |GMX_CMAKE_MINIMUM_REQUIRED_VERSION| or later.
+ +2. Check that you have CMake version |CMAKE_MINIMUM_REQUIRED_VERSION| or later.
   3. Get and unpack the latest version of the |Gromacs| tarball.
   4. Make a separate build directory and change to it. 
   5. Run ``cmake`` with the path to the source as an argument
@@@ -100,12 -100,11 +100,12 @@@ compiler. We recommend gcc, because it 
   frequently provides the best performance.
   
   You should strive to use the most recent version of your
- -compiler. Minimum supported compiler versions are
+ +compiler. Since we require full C++11 support the minimum supported
+ +compiler versions are
   
- -* GNU (gcc) 4.6
- -* Intel (icc) 14
- -* LLVM (clang) 3.4
+ +* GNU (gcc) 4.8.1
+ +* Intel (icc) 15.0
+ +* LLVM (clang) 3.3
   * Microsoft (MSVC) 2015
   
   Other compilers may work (Cray, Pathscale, older clang) but do
@@@ -123,7 -122,7 +123,7 @@@ other compilers, read on
   
   On Linux, both the Intel and clang compiler use the libstdc++ which
   comes with gcc as the default C++ library. For |Gromacs|, we require
- -the compiler to support libstc++ version 4.6.1 or higher. To select a
+ +the compiler to support libstc++ version 4.8.1 or higher. To select a
   particular libstdc++ library, use:
   
   * For Intel: ``-DGMX_STDLIB_CXX_FLAGS=-gcc-name=/path/to/gcc/binary``
@@@ -152,11 -151,6 +152,11 @@@ For all non-x86 platforms, your best op
   the vendor's default or recommended compiler, and check for
   specialized information below.
   
+ +For updated versions of gcc to add to your Linux OS, see
+ +
+ +* Ubuntu: `Ubuntu toolchain ppa page`_
+ +* RHEL/CentOS: `EPEL page`_ or the RedHat Developer Toolset
+ +
   Compiling with parallelization options
   --------------------------------------
   
@@@ -168,10 -162,8 +168,10 @@@ generally built into your compiler and 
   GPU support
   ^^^^^^^^^^^
   |Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
- -NVIDIA's CUDA_ version |REQUIRED_CUDA_VERSION| software development kit is required,
- -and the latest version is strongly encouraged. NVIDIA GPUs with at
+ +On Linux with gcc, NVIDIA's CUDA_ version |REQUIRED_CUDA_VERSION|
+ +software development kit is required, and the latest
+ +version is strongly encouraged. Using Intel or Microsoft compilers
+ +requires version 7.0 and 8.0, respectively. NVIDIA GPUs with at
   least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
   required, e.g. Fermi, Kepler, Maxwell or Pascal cards. You are strongly recommended to
   get the latest CUDA version and driver supported by your hardware, but
@@@ -228,7 -220,7 +228,7 @@@ CMak
   -----
   
   |Gromacs| builds with the CMake build system, requiring at least
- -version |GMX_CMAKE_MINIMUM_REQUIRED_VERSION|. You can check whether
+ +version |CMAKE_MINIMUM_REQUIRED_VERSION|. You can check whether
   CMake is installed, and what version it is, with ``cmake
   --version``. If you need to install CMake, then first check whether
   your platform's package management system provides a suitable version,
@@@ -1035,6 -1027,19 +1035,19 @@@ you may need to specify the use of stat
   ``-DGMX_BUILD_SHARED_EXE=off``, and you may need to set the F77
   environmental variable to ``ftn`` when compiling FFTW.
   
+ Building on Solaris
+ -------------------
+ 
+ The built-in |Gromacs| processor detection does not work on Solaris,
+ so it is strongly recommended that you build |Gromacs| with
+ ``-DGMX_HWLOC=on`` and ensure that the ``CMAKE_PREFIX_PATH`` includes
+ the path where the hwloc headers and libraries can be found. At least
+ version 1.11.8 of hwloc is recommended.
+ 
+ Oracle Developer Studio is not a currently supported compiler (and
+ does not currently compile |Gromacs| correctly, perhaps because the
+ thread-MPI atomics are incorrectly implemented in |Gromacs|).
+ 
   Building on BlueGene
   --------------------
   
@@@ -1150,9 -1155,9 +1163,9 @@@ much everywhere, it is important that w
   it works because we have tested it. We do test on Linux, Windows, and
   Mac with a range of compilers and libraries for a range of our
   configuration options. Every commit in our git source code repository
- -is currently tested on x86 with gcc versions ranging from 4.6 through
- -5.2, and versions 16 of the Intel compiler as well as Clang
- -version 3.4 through 3.8. For this, we use a variety of GNU/Linux
+ +is currently tested on x86 with a number of gcc versions ranging from 4.8.1
+ +through 6.1, versions 16 of the Intel compiler, and Clang
+ +versions 3.4 through 3.8. For this, we use a variety of GNU/Linux
   flavors and versions as well as recent versions of Windows. Under
   Windows, we test both MSVC 2015 and version 16 of the Intel compiler.
   For details, you can
diff --combined docs/user-guide/environment-variables.rst

index f95ea156634c7dfec0c2fe46482f4f6657e5d30a,597ed8f900e9f36f5ebd5cf9bf01568ee4c42213..76fd58770621590c849a86cb259a576776ba9933
--- 1/docs/user-guide/environment-variables.rst
--- 2/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@@ -74,6 -74,10 +74,10 @@@ Output Contro
           :ref:`pdb` file, which allows to check whether multimeric channels have
           the correct PBC representation.
   
+ ``GMX_TRAJECTORY_IO_VERBOSITY``
+         Defaults to 1, which prints frame count e.g. when reading trajectory
+         files. Set to 0 for quiet operation.
+ 
   Debugging
   ---------
   ``GMX_PRINT_DEBUG_LINES``
@@@ -195,6 -199,8 +199,6 @@@ Performance and Run Contro
   ``GMX_EMULATE_GPU``
           emulate GPU runs by using algorithmically equivalent CPU reference code instead of
           GPU-accelerated functions. As the CPU code is slow, it is intended to be used only for debugging purposes.
- -        The behavior is automatically triggered if non-bonded calculations are turned off using ``GMX_NO_NONBONDED``
- -        case in which the non-bonded calculations will not be called, but the CPU-GPU transfer will also be skipped.
   
   ``GMX_ENX_NO_FATAL``
           disable exiting upon encountering a corrupted frame in an :ref:`edr`
@@@ -282,7 -288,6 +286,7 @@@
           skip non-bonded calculations; can be used to estimate the possible
           performance gain from adding a GPU accelerator to the current hardware setup -- assuming that this is
           fast enough to complete the non-bonded calculations while the CPU does bonded force and PME computation.
+ +        Freezing the particles will be required to stop the system blowing up.
   
   ``GMX_NO_PULLVIR``
           when set, do not add virial contribution to COM pull forces.
@@@ -351,15 -356,9 +355,15 @@@
   ``MDRUN``
           the :ref:`gmx mdrun` command used by :ref:`gmx tune_pme`.
   
- -``GMX_NSTLIST``
- -        sets the default value for :mdp:`nstlist`, preventing it from being tuned during
- -        :ref:`gmx mdrun` startup when using the Verlet cutoff scheme.
+ +``GMX_DISABLE_DYNAMICPRUNING``
+ +        disables dynamic pair-list pruning. Note that :ref:`gmx mdrun` will
+ +        still tune nstlist to the optimal value picked assuming dynamic pruning. Thus
+ +        for good performance the -nstlist option should be used.
+ +
+ +``GMX_NSTLIST_DYNAMICPRUNING``
+ +        overrides the dynamic pair-list pruning interval chosen heuristically
+ +        by mdrun. Values should be between the pruning frequency value
+ +        (1 for CPU and 2 for GPU) and :mdp:`nstlist - 1`.
   
   ``GMX_USE_TREEREDUCE``
           use tree reduction for nbnxn force reduction. Potentially faster for large number of
@@@ -459,11 -458,6 +463,11 @@@ compilation of OpenCL kernels, but the
           override |Gromacs| default behavior, or if you want to test
           your own kernels.
   
+ +``GMX_OCL_DISABLE_COMPATIBILITY_CHECK``
+ +        Disables the hardware compatibility check. Useful for developers
+ +        and allows testing the OpenCL kernels on non-supported platforms
+ +        (like Intel iGPUs) without source code modification.
+ +
   Analysis and Core Functions
   ---------------------------
   ``GMX_QM_ACCURACY``
diff --combined src/gromacs/fileio/oenv.cpp

index c1d19ce08ea181832f6c42b527de1b2fb4235b91,9a9c6c2d44cd15674f806c8800e5960ae17aa522..745ea6c9fd27d213287bbad7ca1eb7566d7ddf05
--- 1/src/gromacs/fileio/oenv.cpp
--- 2/src/gromacs/fileio/oenv.cpp
+++ b/src/gromacs/fileio/oenv.cpp
@@@ -41,16 -41,18 +41,18 @@@
   #include "gromacs/utility/exceptions.h"
   #include "gromacs/utility/programcontext.h"
   #include "gromacs/utility/smalloc.h"
+ +#include "gromacs/utility/stringutil.h"
   
   struct gmx_output_env_t
   {
       explicit gmx_output_env_t(const gmx::IProgramContext &context)
- -        : programContext(context)
- -    {
- -        time_unit               = time_ps;
- -        view                    = FALSE;
- -        xvg_format              = exvgNONE;
- -        verbosity               = 0;
- -        trajectory_io_verbosity = 1;
- -    }
+ +        : programContext(context),
+ +          time_unit(time_ps),
+ +          view(FALSE),
+ +          xvg_format(exvgNONE),
-           verbosity(0) {}
++          verbosity(0),
++          trajectory_io_verbosity(0) {}
++
   
       const gmx::IProgramContext  &programContext;
   
@@@ -62,6 -64,8 +64,8 @@@
       xvg_format_t                         xvg_format;
       /* The level of verbosity for this program */
       int                                  verbosity;
+     /* The level of verbosity during trajectory I/O. Default=1, quiet=0. */
+     int                                  trajectory_io_verbosity;
   };
   
   /* The source code in this file should be thread-safe.
@@@ -78,12 -82,12 +82,12 @@@
   static const real  timefactors[] =   { real(0),  real(1e3),  real(1), real(1e-3), real(1e-6), real(1e-9), real(1e-12), real(0) };
   static const real  timeinvfactors[] = { real(0), real(1e-3),  real(1),  real(1e3),  real(1e6),  real(1e9),  real(1e12), real(0) };
   static const char *time_units_str[] = {
- -    NULL, "fs", "ps", "ns", "us",
+ +    nullptr, "fs", "ps", "ns", "us",
       "\\mus", "ms", "s"
   };
   static const char *time_units_xvgr[] = {
- -    NULL, "fs", "ps", "ns",
- -    "ms", "s", NULL
+ +    nullptr, "fs", "ps", "ns",
+ +    "ms", "s", nullptr
   };
   
   
@@@ -102,6 -106,9 +106,9 @@@ void output_env_init(gmx_output_env_t *
           oenv->view        = view;
           oenv->xvg_format  = xvg_format;
           oenv->verbosity   = verbosity;
+         const char *env = getenv("GMX_TRAJECTORY_IO_VERBOSITY");
+         oenv->trajectory_io_verbosity = (env != nullptr ? strtol(env, NULL, 10) : 1);
+ 
       }
       GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
   }
@@@ -127,21 -134,36 +134,26 @@@ int output_env_get_verbosity(const gmx_
       return oenv->verbosity;
   }
   
- -const char *output_env_get_time_unit(const gmx_output_env_t *oenv)
+ int output_env_get_trajectory_io_verbosity(const gmx_output_env_t *oenv)
+ {
+     return oenv->trajectory_io_verbosity;
+ }
+ 
+ +std::string output_env_get_time_unit(const gmx_output_env_t *oenv)
   {
       return time_units_str[oenv->time_unit];
   }
   
- -const char *output_env_get_time_label(const gmx_output_env_t *oenv)
+ +std::string output_env_get_time_label(const gmx_output_env_t *oenv)
   {
- -    char *label;
- -    snew(label, 20);
- -
- -    sprintf(label, "Time (%s)", time_units_str[oenv->time_unit] ?
- -            time_units_str[oenv->time_unit] : "ps");
- -
- -    return label;
+ +    return gmx::formatString("Time (%s)", time_units_str[oenv->time_unit] ?
+ +                             time_units_str[oenv->time_unit] : "ps");
   }
   
- -const char *output_env_get_xvgr_tlabel(const gmx_output_env_t *oenv)
+ +std::string output_env_get_xvgr_tlabel(const gmx_output_env_t *oenv)
   {
- -    char *label;
- -    snew(label, 20);
- -
- -    sprintf(label, "Time (%s)", time_units_xvgr[oenv->time_unit] ?
- -            time_units_xvgr[oenv->time_unit] : "ps");
- -
- -    return label;
+ +    return gmx::formatString("Time (%s)", time_units_xvgr[oenv->time_unit] ?
+ +                             time_units_xvgr[oenv->time_unit] : "ps");
   }
   
   real output_env_get_time_factor(const gmx_output_env_t *oenv)
@@@ -185,7 -207,7 +197,7 @@@ xvg_format_t output_env_get_xvg_format(
   
   const char *output_env_get_program_display_name(const gmx_output_env_t *oenv)
   {
- -    const char *displayName = NULL;
+ +    const char *displayName = nullptr;
   
       try
       {
diff --combined src/gromacs/fileio/oenv.h

index 51c7b23b3a0d33509516d180e429108521089c87,58ec84ec972181688640e6d84f6443ba90995658..57188976064d3594526d5f3d21d97385918459d4
--- 1/src/gromacs/fileio/oenv.h
--- 2/src/gromacs/fileio/oenv.h
+++ b/src/gromacs/fileio/oenv.h
@@@ -37,17 -37,20 +37,18 @@@
   #ifndef GMX_FILEIO_OENV_H
   #define GMX_FILEIO_OENV_H
   
+ +#include <string>
+ +
   #include "gromacs/utility/basedefinitions.h"
   #include "gromacs/utility/real.h"
   
- -#ifdef __cplusplus
- -extern "C" {
- -#endif
- -
   struct gmx_output_env_t;
   
   /* output_env member functions */
   
- /* The output_env structure holds information about program name, cmd line,
-    default times, etc.
+ /* The output_env structure holds information about program name, cmd
+    line, default times, etc. along with verbosity levels for code
+    components that use this structure for regulating output.
   
      There are still legacy functions for the program name, and the command
      line, but the output_env versions are now preferred.*/
@@@ -75,14 -78,17 +76,17 @@@ void output_env_done(gmx_output_env_t *
   int output_env_get_verbosity(const gmx_output_env_t *oenv);
   /* return the verbosity */
   
- -const char *output_env_get_time_unit(const gmx_output_env_t *oenv);
+ int output_env_get_trajectory_io_verbosity(const gmx_output_env_t *oenv);
+ /* return the verbosity for trajectory IO handling */
+ 
+ +std::string output_env_get_time_unit(const gmx_output_env_t *oenv);
   /* return time unit (e.g. ps or ns) */
   
- -const char *output_env_get_time_label(const gmx_output_env_t *oenv);
+ +std::string output_env_get_time_label(const gmx_output_env_t *oenv);
   /* return time unit label (e.g. "Time (ps)") */
   
- -const char *output_env_get_xvgr_tlabel(const gmx_output_env_t *oenv);
- -/* retrun x-axis time label for xmgr */
+ +std::string output_env_get_xvgr_tlabel(const gmx_output_env_t *oenv);
+ +/* return x-axis time label for xmgr */
   
   real output_env_get_time_factor(const gmx_output_env_t *oenv);
   /* return time conversion factor from ps (i.e. 1e-3 for ps->ns) */
@@@ -107,6 -113,9 +111,6 @@@ xvg_format_t output_env_get_xvg_format(
    */
   const char *output_env_get_program_display_name(const gmx_output_env_t *oenv);
   
- -#ifdef __cplusplus
- -}
- -
   namespace gmx
   {
   class IProgramContext;
@@@ -128,3 -137,5 +132,3 @@@ const gmx::IProgramContext 
   output_env_get_program_context(const gmx_output_env_t *oenv);
   
   #endif
- -
- -#endif
diff --combined src/gromacs/fileio/trxio.cpp

index 8c03614008473739088513240be0b26bdfdc782a,f25eaa1e01aa3833ef80a661983446d755682915..1786c17ead287f3a217e43757de086487a5b6483
--- 1/src/gromacs/fileio/trxio.cpp
--- 2/src/gromacs/fileio/trxio.cpp
+++ b/src/gromacs/fileio/trxio.cpp
@@@ -55,6 -55,7 +55,6 @@@
   #include "gromacs/fileio/pdbio.h"
   #include "gromacs/fileio/timecontrol.h"
   #include "gromacs/fileio/tngio.h"
- -#include "gromacs/fileio/tngio_for_tools.h"
   #include "gromacs/fileio/tpxio.h"
   #include "gromacs/fileio/trrio.h"
   #include "gromacs/fileio/xdrf.h"
@@@ -87,6 -88,7 +87,6 @@@ struct t_trxstatu
                                                  * for skipping frames with -dt     */
       real                    tf;               /* internal frame time              */
       t_trxframe             *xframe;
- -    int                     nxframe;
       t_fileio               *fio;
       tng_trajectory_t        tng;
       int                     natoms;
@@@ -168,13 -170,14 +168,13 @@@ static void initcount(t_trxstatus *stat
   static void status_init(t_trxstatus *status)
   {
       status->flags           = 0;
- -    status->nxframe         = 0;
- -    status->xframe          = NULL;
- -    status->fio             = NULL;
+ +    status->xframe          = nullptr;
+ +    status->fio             = nullptr;
       status->__frame         = -1;
       status->t0              = 0;
       status->tf              = 0;
- -    status->persistent_line = NULL;
- -    status->tng             = NULL;
+ +    status->persistent_line = nullptr;
+ +    status->tng             = nullptr;
   }
   
   
@@@ -188,7 -191,8 +188,8 @@@ static void printcount_(t_trxstatus *st
   {
       if ((status->__frame < 2*SKIP1 || status->__frame % SKIP1 == 0) &&
           (status->__frame < 2*SKIP2 || status->__frame % SKIP2 == 0) &&
-         (status->__frame < 2*SKIP3 || status->__frame % SKIP3 == 0))
+         (status->__frame < 2*SKIP3 || status->__frame % SKIP3 == 0) &&
+         output_env_get_trajectory_io_verbosity(oenv) != 0)
       {
           fprintf(stderr, "\r%-14s %6d time %8.3f   ", l, status->__frame,
                   output_env_conv_time(oenv, t));
@@@ -282,6 -286,7 +283,6 @@@ float trx_get_time_of_final_frame(t_trx
   void clear_trxframe(t_trxframe *fr, gmx_bool bFirst)
   {
       fr->not_ok    = 0;
- -    fr->bTitle    = FALSE;
       fr->bStep     = FALSE;
       fr->bTime     = FALSE;
       fr->bLambda   = FALSE;
@@@ -296,15 -301,16 +297,15 @@@
       {
           fr->bDouble   = FALSE;
           fr->natoms    = -1;
- -        fr->title     = NULL;
           fr->step      = 0;
           fr->time      = 0;
           fr->lambda    = 0;
           fr->fep_state = 0;
- -        fr->atoms     = NULL;
+ +        fr->atoms     = nullptr;
           fr->prec      = 0;
- -        fr->x         = NULL;
- -        fr->v         = NULL;
- -        fr->f         = NULL;
+ +        fr->x         = nullptr;
+ +        fr->v         = nullptr;
+ +        fr->f         = nullptr;
           clear_mat(fr->box);
           fr->bPBC   = FALSE;
           fr->ePBC   = -1;
@@@ -321,7 -327,7 +322,7 @@@ int write_trxframe_indexed(t_trxstatus 
                              const int *ind, gmx_conect gc)
   {
       char  title[STRLEN];
- -    rvec *xout = NULL, *vout = NULL, *fout = NULL;
+ +    rvec *xout = nullptr, *vout = nullptr, *fout = nullptr;
       int   i, ftp = -1;
       real  prec;
   
@@@ -381,7 -387,7 +382,7 @@@
                       copy_rvec(fr->f[ind[i]], fout[i]);
                   }
               }
- -        /* no break */
+ +        // fallthrough
           case efXTC:
               if (fr->bX)
               {
@@@ -421,7 -427,7 +422,7 @@@
               if (ftp == efGRO)
               {
                   write_hconf_indexed_p(gmx_fio_getfp(status->fio), title, fr->atoms, nind, ind,
- -                                      fr->x, fr->bV ? fr->v : NULL, fr->box);
+ +                                      fr->x, fr->bV ? fr->v : nullptr, fr->box);
               }
               else
               {
@@@ -430,8 -436,7 +431,8 @@@
               }
               break;
           case efG96:
- -            write_g96_conf(gmx_fio_getfp(status->fio), fr, nind, ind);
+ +            sprintf(title, "frame t= %.3f", fr->time);
+ +            write_g96_conf(gmx_fio_getfp(status->fio), title, fr, nind, ind);
               break;
           default:
               gmx_fatal(FARGS, "Sorry, write_trxframe_indexed can not write %s",
@@@ -451,7 -456,7 +452,7 @@@
               {
                   sfree(fout);
               }
- -        /* no break */
+ +        // fallthrough
           case efXTC:
               sfree(xout);
               break;
@@@ -477,13 -482,13 +478,13 @@@ void trjtools_gmx_prepare_tng_writing(c
           gmx_incons("Sorry, can only prepare for TNG output.");
       }
   
- -    if (*out == NULL)
+ +    if (*out == nullptr)
       {
           snew((*out), 1);
       }
       status_init(*out);
   
- -    if (in != NULL)
+ +    if (in != nullptr)
       {
           gmx_prepare_tng_writing(filename,
                                   filemode,
@@@ -558,7 -563,7 +559,7 @@@ int write_trxframe(t_trxstatus *status
               break;
           case efTRR:
               gmx_trr_write_frame(status->fio, fr->step, fr->time, fr->lambda, fr->box, fr->natoms,
- -                                fr->bX ? fr->x : NULL, fr->bV ? fr->v : NULL, fr->bF ? fr->f : NULL);
+ +                                fr->bX ? fr->x : nullptr, fr->bV ? fr->v : nullptr, fr->bF ? fr->f : nullptr);
               break;
           case efGRO:
           case efPDB:
@@@ -573,7 -578,7 +574,7 @@@
               if (gmx_fio_getftp(status->fio) == efGRO)
               {
                   write_hconf_p(gmx_fio_getfp(status->fio), title, fr->atoms,
- -                              fr->x, fr->bV ? fr->v : NULL, fr->box);
+ +                              fr->x, fr->bV ? fr->v : nullptr, fr->box);
               }
               else
               {
@@@ -583,7 -588,7 +584,7 @@@
               }
               break;
           case efG96:
- -            write_g96_conf(gmx_fio_getfp(status->fio), fr, -1, NULL);
+ +            write_g96_conf(gmx_fio_getfp(status->fio), title, fr, -1, nullptr);
               break;
           default:
               gmx_fatal(FARGS, "Sorry, write_trxframe can not write %s",
@@@ -605,11 -610,11 +606,11 @@@ int write_trx(t_trxstatus *status, int 
       fr.step   = step;
       fr.bTime  = TRUE;
       fr.time   = time;
- -    fr.bAtoms = atoms != NULL;
+ +    fr.bAtoms = atoms != nullptr;
       fr.atoms  = const_cast<t_atoms *>(atoms);
       fr.bX     = TRUE;
       fr.x      = x;
- -    fr.bV     = v != NULL;
+ +    fr.bV     = v != nullptr;
       fr.v      = v;
       fr.bBox   = TRUE;
       copy_mat(box, fr.box);
@@@ -628,14 -633,6 +629,14 @@@ void close_trx(t_trxstatus *status
       {
           gmx_fio_close(status->fio);
       }
+ +    sfree(status->persistent_line);
+ +#if GMX_USE_PLUGINS
+ +    sfree(status->vmdplugin);
+ +#endif
+ +    /* The memory in status->xframe is lost here,
+ +     * but the read_first_x/read_next_x functions are deprecated anyhow.
+ +     * read_first_frame/read_next_frame and close_trx should be used.
+ +     */
       sfree(status);
   }
   
@@@ -675,7 -672,7 +676,7 @@@ static gmx_bool gmx_next_frame(t_trxsta
           fr->bBox      = sh.box_size > 0;
           if (status->flags & (TRX_READ_X | TRX_NEED_X))
           {
- -            if (fr->x == NULL)
+ +            if (fr->x == nullptr)
               {
                   snew(fr->x, sh.natoms);
               }
@@@ -683,7 -680,7 +684,7 @@@
           }
           if (status->flags & (TRX_READ_V | TRX_NEED_V))
           {
- -            if (fr->v == NULL)
+ +            if (fr->v == nullptr)
               {
                   snew(fr->v, sh.natoms);
               }
@@@ -691,7 -688,7 +692,7 @@@
           }
           if (status->flags & (TRX_READ_F | TRX_NEED_F))
           {
- -            if (fr->f == NULL)
+ +            if (fr->f == nullptr)
               {
                   snew(fr->f, sh.natoms);
               }
@@@ -728,12 -725,12 +729,12 @@@ static gmx_bool pdb_next_x(t_trxstatus 
       double    dbl;
   
       atoms.nr      = fr->natoms;
- -    atoms.atom    = NULL;
- -    atoms.pdbinfo = NULL;
+ +    atoms.atom    = nullptr;
+ +    atoms.pdbinfo = nullptr;
       /* the other pointers in atoms should not be accessed if these are NULL */
       snew(symtab, 1);
       open_symtab(symtab);
- -    na       = read_pdbfile(fp, title, &model_nr, &atoms, symtab, fr->x, &ePBC, boxpdb, TRUE, NULL);
+ +    na       = read_pdbfile(fp, title, &model_nr, &atoms, symtab, fr->x, &ePBC, boxpdb, TRUE, nullptr);
       free_symtab(symtab);
       sfree(symtab);
       set_trxframe_ePBC(fr, ePBC);
@@@ -842,9 -839,12 +843,9 @@@ gmx_bool read_next_frame(const gmx_outp
                   break;
               case efG96:
               {
- -                t_symtab *symtab;
- -                snew(symtab, 1);
- -                open_symtab(symtab);
- -                read_g96_conf(gmx_fio_getfp(status->fio), NULL, fr,
+ +                t_symtab *symtab = nullptr;
+ +                read_g96_conf(gmx_fio_getfp(status->fio), nullptr, nullptr, fr,
                                 symtab, status->persistent_line);
- -                free_symtab(symtab);
                   bRet = (fr->natoms > 0);
                   break;
               }
@@@ -873,7 -873,7 +874,7 @@@
                   }
                   break;
               case efTNG:
- -                bRet = gmx_read_next_tng_frame(status->tng, fr, NULL, 0);
+ +                bRet = gmx_read_next_tng_frame(status->tng, fr, nullptr, 0);
                   break;
               case efPDB:
                   bRet = pdb_next_x(status, gmx_fio_getfp(status->fio), fr);
@@@ -935,7 -935,7 +936,7 @@@
   int read_first_frame(const gmx_output_env_t *oenv, t_trxstatus **status,
                        const char *fn, t_trxframe *fr, int flags)
   {
- -    t_fileio      *fio;
+ +    t_fileio      *fio = nullptr;
       gmx_bool       bFirst, bOK;
       int            ftp   = fn2ftp(fn);
   
@@@ -946,6 -946,7 +947,6 @@@
       snew((*status), 1);
   
       status_init( *status );
- -    (*status)->nxframe = 1;
       initcount(*status);
       (*status)->flags = flags;
   
@@@ -974,8 -975,11 +975,8 @@@
                   /* allocate the persistent line */
                   snew((*status)->persistent_line, STRLEN+1);
               }
- -            t_symtab *symtab;
- -            snew(symtab, 1);
- -            open_symtab(symtab);
- -            read_g96_conf(gmx_fio_getfp(fio), fn, fr, symtab, (*status)->persistent_line);
- -            free_symtab(symtab);
+ +            t_symtab *symtab = nullptr;
+ +            read_g96_conf(gmx_fio_getfp(fio), fn, nullptr, fr, symtab, (*status)->persistent_line);
               gmx_fio_close(fio);
               clear_trxframe(fr, FALSE);
               if (flags & (TRX_READ_X | TRX_NEED_X))
@@@ -1014,7 -1018,7 +1015,7 @@@
               break;
           case efTNG:
               fr->step = -1;
- -            if (!gmx_read_next_tng_frame((*status)->tng, fr, NULL, 0))
+ +            if (!gmx_read_next_tng_frame((*status)->tng, fr, nullptr, 0))
               {
                   fr->not_ok = DATA_NOT_OK;
                   fr->natoms = 0;
@@@ -1097,6 -1101,7 +1098,6 @@@ int read_first_x(const gmx_output_env_
       read_first_frame(oenv, status, fn, &fr, TRX_NEED_X);
   
       snew((*status)->xframe, 1);
- -    (*status)->nxframe   = 1;
       (*(*status)->xframe) = fr;
       *t                   = (*status)->xframe->time;
       *x                   = (*status)->xframe->x;
@@@ -1119,6 -1124,25 +1120,6 @@@ gmx_bool read_next_x(const gmx_output_e
       return bRet;
   }
   
- -void close_trj(t_trxstatus *status)
- -{
- -    if (status == nullptr)
- -    {
- -        return;
- -    }
- -    gmx_tng_close(&status->tng);
- -    if (status->fio)
- -    {
- -        gmx_fio_close(status->fio);
- -    }
- -
- -    /* The memory in status->xframe is lost here,
- -     * but the read_first_x/read_next_x functions are deprecated anyhow.
- -     * read_first_frame/read_next_frame and close_trx should be used.
- -     */
- -    sfree(status);
- -}
- -
   void rewind_trj(t_trxstatus *status)
   {
       initcount(status);
@@@ -1134,7 -1158,7 +1135,7 @@@ t_topology *read_top(const char *fn, in
       t_topology *top;
   
       snew(top, 1);
- -    epbc = read_tpx_top(fn, NULL, NULL, &natoms, NULL, NULL, top);
+ +    epbc = read_tpx_top(fn, nullptr, nullptr, &natoms, nullptr, nullptr, top);
       if (ePBC)
       {
           *ePBC = epbc;
diff --combined src/gromacs/gmxlib/nonbonded/CMakeLists.txt

index 69b077b6ab4e7fc1290126f5a99577bde78321ae,841e2f0112b465ceec3c98d3c6078137f8a4f618..f588792ad56e43c7550ebb64e27518d1ec8c17ba
--- 1/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
--- 2/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
+++ b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
@@@ -35,39 -35,47 +35,47 @@@
   # Sources that should always be built
   file(GLOB NONBONDED_SOURCES *.cpp nb_kernel_c/*.c)
   
- -if("${GMX_SIMD}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
       file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
       file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
       file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
   endif()
   
- if((("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_256") OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX2_256")) AND NOT GMX_DOUBLE)
- -if((("${GMX_SIMD}" STREQUAL "AVX_256")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX2_256")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX_512")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX_512_KNL"))
++if((("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_256")
++    OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX2_256")
++    OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512")
++    OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
+         AND NOT GMX_DOUBLE)
       file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "SSE2" AND GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "SSE2" AND GMX_DOUBLE)
       file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
       file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
       file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
   endif()
   
- if((("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_256") OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX2_256")) AND GMX_DOUBLE)
- -if((("${GMX_SIMD}" STREQUAL "AVX_256")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX2_256")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX_512")
- -            OR ("${GMX_SIMD}" STREQUAL "AVX_512_KNL"))
++if((("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_256")
++            OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX2_256")
++            OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512")
++            OR ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
+         AND GMX_DOUBLE)
       file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
   endif()
   
- -if("${GMX_SIMD}" STREQUAL "SPARC64_HPC_ACE" AND GMX_DOUBLE)
+ +if("${GMX_SIMD_ACTIVE}" STREQUAL "SPARC64_HPC_ACE" AND GMX_DOUBLE)
       file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
   endif()
   
diff --combined src/gromacs/hardware/cpuinfo.cpp

index 1d2cf357c9a8173b86924c5bfc055a99123bb9e2,df8d70086f945ca8b10f640af6d69e6f1358c9f1..3ff092d2846e4aabead3563da5bd2e1d6b3143f7
--- 1/src/gromacs/hardware/cpuinfo.cpp
--- 2/src/gromacs/hardware/cpuinfo.cpp
+++ b/src/gromacs/hardware/cpuinfo.cpp
@@@ -99,8 -99,6 +99,8 @@@
   #    include "gromacs/utility/basedefinitions.h"
   #endif
   
+ +#include "architecture.h"
+ +
   namespace gmx
   {
   
@@@ -154,60 -152,62 +154,60 @@@ executeX86CpuID(unsigned int     gmx_un
                   unsigned int *              ecx,
                   unsigned int *              edx)
   {
- -#if defined __i386__ || defined __i386 || defined _X86_ || defined _M_IX86 || \
- -    defined __x86_64__ || defined __amd64__ || defined _M_X64 || defined _M_AMD64
- -
- -#    if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
- -
- -    // any compiler that understands gcc inline assembly
- -    *eax = level;
- -    *ecx = ecxval;
- -    *ebx = 0;
- -    *edx = 0;
- -
- -#        if (defined __i386__ || defined __i386 || defined _X86_ || defined _M_IX86) && defined(__PIC__)
- -    // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
- -    __asm__ __volatile__ ("xchgl %%ebx, %1  \n\t"
- -                          "cpuid            \n\t"
- -                          "xchgl %%ebx, %1  \n\t"
- -                          : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
- -#        else
- -    // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
- -    __asm__ __volatile__ ("cpuid            \n\t"
- -                          : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
- -#        endif
- -    return 0;
- -
- -#    elif defined _MSC_VER
- -
- -    // MSVC (and icc on windows) on ia32 or x86-64
- -    int cpuInfo[4];
- -    __cpuidex(cpuInfo, level, ecxval);
- -    *eax = static_cast<unsigned int>(cpuInfo[0]);
- -    *ebx = static_cast<unsigned int>(cpuInfo[1]);
- -    *ecx = static_cast<unsigned int>(cpuInfo[2]);
- -    *edx = static_cast<unsigned int>(cpuInfo[3]);
- -    return 0;
- -
+ +    if (c_architecture == Architecture::X86)
+ +    {
+ +#if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
+ +
+ +        // any compiler that understands gcc inline assembly
+ +        *eax = level;
+ +        *ecx = ecxval;
+ +        *ebx = 0;
+ +        *edx = 0;
+ +
+ +#    if (defined __i386__ || defined __i386 || defined _X86_ || defined _M_IX86) && defined(__PIC__)
+ +        // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
+ +        __asm__ __volatile__ ("xchgl %%ebx, %1  \n\t"
+ +                              "cpuid            \n\t"
+ +                              "xchgl %%ebx, %1  \n\t"
+ +                              : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
   #    else
+ +        // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
+ +        __asm__ __volatile__ ("cpuid            \n\t"
+ +                              : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
+ +#    endif
+ +        return 0;
   
- -    // We are on x86, but without compiler support for cpuid if we get here
- -    *eax = 0;
- -    *ebx = 0;
- -    *ecx = 0;
- -    *edx = 0;
- -    return 1;
+ +#elif defined _MSC_VER
   
- -#    endif  // check for inline asm on x86
+ +        // MSVC (and icc on windows) on ia32 or x86-64
+ +        int cpuInfo[4];
+ +        __cpuidex(cpuInfo, level, ecxval);
+ +        *eax = static_cast<unsigned int>(cpuInfo[0]);
+ +        *ebx = static_cast<unsigned int>(cpuInfo[1]);
+ +        *ecx = static_cast<unsigned int>(cpuInfo[2]);
+ +        *edx = static_cast<unsigned int>(cpuInfo[3]);
+ +        return 0;
   
   #else
   
- -    // We are not on x86
- -    *eax = 0;
- -    *ebx = 0;
- -    *ecx = 0;
- -    *edx = 0;
- -    return 1;
+ +        // We are on x86, but without compiler support for cpuid if we get here
+ +        *eax = 0;
+ +        *ebx = 0;
+ +        *ecx = 0;
+ +        *edx = 0;
+ +        return 1;
   
- -#endif      // x86
+ +#endif          // check for inline asm on x86
+ +    }
+ +    else
+ +    {
+ +        // We are not on x86
+ +        *eax = 0;
+ +        *ebx = 0;
+ +        *ecx = 0;
+ +        *edx = 0;
+ +        return 1;
+ +    }
   }
   
   
@@@ -669,7 -669,8 +669,8 @@@ detectProcCpuInfoVendor(const std::map<
           { "AArch64",      CpuInfo::Vendor::Arm     },
           { "Fujitsu",      CpuInfo::Vendor::Fujitsu },
           { "IBM",          CpuInfo::Vendor::Ibm     },
-         { "POWER",        CpuInfo::Vendor::Ibm     }
+         { "POWER",        CpuInfo::Vendor::Ibm     },
+         { "Oracle",       CpuInfo::Vendor::Oracle  },
       };
   
       // For each label in /proc/cpuinfo, compare the value to the name in the
@@@ -776,7 -777,7 +777,7 @@@ detectProcCpuInfoArm(const std::map<std
       }
       if (cpuInfo.count("CPU architecture"))
       {
- -        *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), NULL, 10);
+ +        *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
           // For some 64-bit CPUs it appears to say 'AArch64' instead
           if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
           {
@@@ -785,11 -786,11 +786,11 @@@
       }
       if (cpuInfo.count("CPU variant"))
       {
- -        *model    = std::strtol(cpuInfo.at("CPU variant").c_str(), NULL, 16);
+ +        *model    = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
       }
       if (cpuInfo.count("CPU revision"))
       {
- -        *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), NULL, 10);
+ +        *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
       }
   
       if (cpuInfo.count("Features"))
@@@ -872,44 -873,36 +873,48 @@@ CpuInfo CpuInfo::detect(
   {
       CpuInfo result;
   
- -#if defined __i386__ || defined __i386 || defined _X86_ || defined _M_IX86 || \
- -    defined __x86_64__ || defined __amd64__ || defined _M_X64 || defined _M_AMD64
- -
- -    result.vendor_            = detectX86Vendor();
- -    detectX86Features(&result.brandString_, &result.family_, &result.model_,
- -                      &result.stepping_, &result.features_);
- -    result.logicalProcessors_ = detectX86LogicalProcessors();
- -#else   // not x86
- -
- -#    if defined __arm__ || defined __arm || defined _M_ARM || defined __aarch64__
- -    result.vendor_  = CpuInfo::Vendor::Arm;
- -#    elif defined __powerpc__ || defined __ppc__ || defined __PPC__
- -    result.vendor_  = CpuInfo::Vendor::Ibm;
- -#    endif
+ +    if (c_architecture == Architecture::X86)
+ +    {
+ +        result.vendor_ = detectX86Vendor();
   
- -#    if defined __aarch64__ || ( defined _M_ARM && _M_ARM >= 8 )
- -    result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
- -    result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
- -#    endif
+ +        if (result.vendor_ == CpuInfo::Vendor::Intel)
+ +        {
+ +            result.features_.insert(CpuInfo::Feature::X86_Intel);
+ +        }
+ +        else if (result.vendor_ == CpuInfo::Vendor::Amd)
+ +        {
+ +            result.features_.insert(CpuInfo::Feature::X86_Amd);
+ +        }
+ +        detectX86Features(&result.brandString_, &result.family_, &result.model_,
+ +                          &result.stepping_, &result.features_);
+ +        result.logicalProcessors_ = detectX86LogicalProcessors();
+ +    }
+ +    else
+ +    {
+ +        // Not x86
+ +        if (c_architecture == Architecture::Arm)
+ +        {
+ +            result.vendor_  = CpuInfo::Vendor::Arm;
+ +        }
+ +        else if (c_architecture == Architecture::PowerPC)
+ +        {
+ +            result.vendor_  = CpuInfo::Vendor::Ibm;
+ +        }
   
- -#    if defined sun
- -    result.vendor_ = CpuInfo::Vendor::Oracle;
- -#    endif
+ +#if defined __aarch64__ || ( defined _M_ARM && _M_ARM >= 8 )
+ +        result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
+ +        result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
+ +#endif
   
- -    // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
- -    // is set to a known value this routine will not overwrite it.
- -    detectProcCpuInfo(&result.vendor_, &result.brandString_, &result.family_,
- -                      &result.model_, &result.stepping_, &result.features_);
++#if defined sun
++        result.vendor_ = CpuInfo::Vendor::Oracle;
++#endif
+ 
- -#endif  // x86 or not
+ +        // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
+ +        // is set to a known value this routine will not overwrite it.
+ +        detectProcCpuInfo(&result.vendor_, &result.brandString_, &result.family_,
+ +                          &result.model_, &result.stepping_, &result.features_);
+ +    }
   
       if (!result.logicalProcessors_.empty())
       {
@@@ -948,7 -941,8 +953,8 @@@ CpuInfo::s_vendorStrings_ 
       { CpuInfo::Vendor::Amd, "AMD"                                 },
       { CpuInfo::Vendor::Fujitsu, "Fujitsu"                         },
       { CpuInfo::Vendor::Ibm, "IBM"                                 },
-     { CpuInfo::Vendor::Arm, "ARM"                                 }
+     { CpuInfo::Vendor::Arm, "ARM"                                 },
+     { CpuInfo::Vendor::Oracle, "Oracle"                           },
   };
   
   
@@@ -956,7 -950,6 +962,7 @@@ const std::map<CpuInfo::Feature, std::s
   CpuInfo::s_featureStrings_ =
   {
       { CpuInfo::Feature::X86_Aes, "aes"                            },
+ +    { CpuInfo::Feature::X86_Amd, "amd"                            },
       { CpuInfo::Feature::X86_Apic, "apic"                          },
       { CpuInfo::Feature::X86_Avx, "avx"                            },
       { CpuInfo::Feature::X86_Avx2, "avx2"                          },
@@@ -975,7 -968,6 +981,7 @@@
       { CpuInfo::Feature::X86_Fma4, "fma4"                          },
       { CpuInfo::Feature::X86_Hle, "hle"                            },
       { CpuInfo::Feature::X86_Htt, "htt"                            },
+ +    { CpuInfo::Feature::X86_Intel, "intel"                        },
       { CpuInfo::Feature::X86_Lahf, "lahf"                          },
       { CpuInfo::Feature::X86_MisalignSse, "misalignsse"            },
       { CpuInfo::Feature::X86_Mmx, "mmx"                            },
@@@ -1065,15 -1057,11 +1071,15 @@@ main(int argc, char **argv
       }
       else if (arg == "-features")
       {
+ +        // Separate the feature strings with spaces. Note that in the
+ +        // GROMACS cmake code, surrounding whitespace is first
+ +        // stripped by the CPU detection routine, and then added back
+ +        // in the code for making the SIMD suggestion.
           for (auto &f : cpuInfo.featureSet() )
           {
- -            printf(" %s", cpuInfo.featureString(f).c_str());
+ +            printf("%s ", cpuInfo.featureString(f).c_str());
           }
- -        printf(" \n"); // extra space so we can grep output for " <feature> " in CMake
+ +        printf("\n");
       }
       else if (arg == "-topology")
       {
diff --combined src/gromacs/hardware/cpuinfo.h

index 676321a4a7dd71e012ed5e29cede0683991de6b1,bd2be6bc24d786b2de89cb1687cbc575cf9ce65b..33c7875850875e14a20930f9fa1b9660c79d80b6
--- 1/src/gromacs/hardware/cpuinfo.h
--- 2/src/gromacs/hardware/cpuinfo.h
+++ b/src/gromacs/hardware/cpuinfo.h
@@@ -86,6 -86,7 +86,7 @@@ class CpuInf
               Fujitsu,      //!< Only works on Linux (parsed from /proc/cpuinfo)
               Ibm,          //!< Only works on Linux (parsed from /proc/cpuinfo)
               Arm,          //!< Only works on Linux (parsed from /proc/cpuinfo)
+             Oracle,       //!< Cannot detect anything else yet (no /proc/cpuinfo available)
           };
   
           /*! \brief List of CPU features
@@@ -97,7 -98,6 +98,7 @@@
           enum class Feature
           {
               X86_Aes,         //!< x86 advanced encryption standard accel.
+ +            X86_Amd,         //!< This is an AMD x86 processor
               X86_Apic,        //!< APIC support
               X86_Avx,         //!< Advanced vector extensions
               X86_Avx2,        //!< AVX2 including gather support (not used yet)
@@@ -116,7 -116,6 +117,7 @@@
               X86_Fma4,        //!< 4-operand FMA, only on AMD for now
               X86_Hle,         //!< Hardware lock elision
               X86_Htt,         //!< Hyper-Threading supported (but maybe not enabled)
+ +            X86_Intel,       //!< This is an Intel x86 processor
               X86_Lahf,        //!< LAHF/SAHF support in 64 bits
               X86_MisalignSse, //!< Support for misaligned SSE data instructions
               X86_Mmx,         //!< MMX registers and instructions
diff --combined src/gromacs/mdlib/md_support.cpp

index 431b40bf489d58f967b0bba293f6b40865f58c73,1a21fd2e600fb5050f8e635290aa255f49c5067f..31e9955475076af26b220387920fc18d474c24dd
--- 1/src/gromacs/mdlib/md_support.cpp
--- 2/src/gromacs/mdlib/md_support.cpp
+++ b/src/gromacs/mdlib/md_support.cpp
@@@ -44,6 -44,7 +44,6 @@@
   #include <algorithm>
   
   #include "gromacs/domdec/domdec.h"
- -#include "gromacs/gmxlib/md_logging.h"
   #include "gromacs/gmxlib/network.h"
   #include "gromacs/gmxlib/nrnb.h"
   #include "gromacs/math/vec.h"
@@@ -68,7 -69,6 +68,7 @@@
   #include "gromacs/utility/cstringutil.h"
   #include "gromacs/utility/fatalerror.h"
   #include "gromacs/utility/gmxassert.h"
+ +#include "gromacs/utility/logger.h"
   #include "gromacs/utility/smalloc.h"
   #include "gromacs/utility/snprintf.h"
   
@@@ -146,6 -146,93 +146,6 @@@ int multisim_min(const gmx_multisim_t *
       return nmin;
   }
   
- -void copy_coupling_state(t_state *statea, t_state *stateb,
- -                         gmx_ekindata_t *ekinda, gmx_ekindata_t *ekindb, t_grpopts* opts)
- -{
- -
- -    /* MRS note -- might be able to get rid of some of the arguments.  Look over it when it's all debugged */
- -
- -    int i, j, nc;
- -
- -    /* Make sure we have enough space for x and v */
- -    if (statea->nalloc > stateb->nalloc)
- -    {
- -        stateb->nalloc = statea->nalloc;
- -        /* We need to allocate one element extra, since we might use
- -         * (unaligned) 4-wide SIMD loads to access rvec entries.
- -         */
- -        srenew(stateb->x, stateb->nalloc + 1);
- -        srenew(stateb->v, stateb->nalloc + 1);
- -    }
- -
- -    stateb->natoms     = statea->natoms;
- -    stateb->ngtc       = statea->ngtc;
- -    stateb->nnhpres    = statea->nnhpres;
- -    stateb->veta       = statea->veta;
- -    if (ekinda)
- -    {
- -        copy_mat(ekinda->ekin, ekindb->ekin);
- -        for (i = 0; i < stateb->ngtc; i++)
- -        {
- -            ekindb->tcstat[i].T  = ekinda->tcstat[i].T;
- -            ekindb->tcstat[i].Th = ekinda->tcstat[i].Th;
- -            copy_mat(ekinda->tcstat[i].ekinh, ekindb->tcstat[i].ekinh);
- -            copy_mat(ekinda->tcstat[i].ekinf, ekindb->tcstat[i].ekinf);
- -            ekindb->tcstat[i].ekinscalef_nhc =  ekinda->tcstat[i].ekinscalef_nhc;
- -            ekindb->tcstat[i].ekinscaleh_nhc =  ekinda->tcstat[i].ekinscaleh_nhc;
- -            ekindb->tcstat[i].vscale_nhc     =  ekinda->tcstat[i].vscale_nhc;
- -        }
- -    }
- -    copy_rvecn(statea->x, stateb->x, 0, stateb->natoms);
- -    copy_rvecn(statea->v, stateb->v, 0, stateb->natoms);
- -    copy_mat(statea->box, stateb->box);
- -    copy_mat(statea->box_rel, stateb->box_rel);
- -    copy_mat(statea->boxv, stateb->boxv);
- -
- -    for (i = 0; i < stateb->ngtc; i++)
- -    {
- -        nc = i*opts->nhchainlength;
- -        for (j = 0; j < opts->nhchainlength; j++)
- -        {
- -            stateb->nosehoover_xi[nc+j]  = statea->nosehoover_xi[nc+j];
- -            stateb->nosehoover_vxi[nc+j] = statea->nosehoover_vxi[nc+j];
- -        }
- -    }
- -    if (stateb->nhpres_xi != NULL)
- -    {
- -        for (i = 0; i < stateb->nnhpres; i++)
- -        {
- -            nc = i*opts->nhchainlength;
- -            for (j = 0; j < opts->nhchainlength; j++)
- -            {
- -                stateb->nhpres_xi[nc+j]  = statea->nhpres_xi[nc+j];
- -                stateb->nhpres_vxi[nc+j] = statea->nhpres_vxi[nc+j];
- -            }
- -        }
- -    }
- -}
- -
- -real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state, t_extmass *MassQ)
- -{
- -    real quantity = 0;
- -    switch (ir->etc)
- -    {
- -        case etcNO:
- -            break;
- -        case etcBERENDSEN:
- -            break;
- -        case etcNOSEHOOVER:
- -            quantity = NPT_energy(ir, state, MassQ);
- -            break;
- -        case etcVRESCALE:
- -            quantity = vrescale_energy(&(ir->opts), state->therm_integral);
- -            break;
- -        default:
- -            break;
- -    }
- -    return quantity;
- -}
- -
   /* TODO Specialize this routine into init-time and loop-time versions?
      e.g. bReadEkin is only true when restoring from checkpoint */
   void compute_globals(FILE *fplog, gmx_global_stat *gstat, t_commrec *cr, t_inputrec *ir,
@@@ -153,7 -240,7 +153,7 @@@
                        t_state *state, t_mdatoms *mdatoms,
                        t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
                        gmx_enerdata_t *enerd, tensor force_vir, tensor shake_vir, tensor total_vir,
- -                     tensor pres, rvec mu_tot, gmx_constr_t constr,
+ +                     tensor pres, rvec mu_tot, struct gmx_constr *constr,
                        gmx::SimulationSignaller *signalCoordinator,
                        matrix box, int *totalNumberOfBondedInteractions,
                        gmx_bool *bSumEkinhOld, int flags)
@@@ -204,7 -291,7 +204,7 @@@
       if (bStopCM)
       {
           calc_vcm_grp(0, mdatoms->homenr, mdatoms,
- -                     state->x, state->v, vcm);
+ +                     as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), vcm);
       }
   
       if (bTemp || bStopCM || bPres || bEner || bConstrain)
@@@ -224,7 -311,7 +224,7 @@@
               {
                   wallcycle_start(wcycle, ewcMoveE);
                   global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot,
- -                            ir, ekind, constr, bStopCM ? vcm : NULL,
+ +                            ir, ekind, constr, bStopCM ? vcm : nullptr,
                               signalBuffer.size(), signalBuffer.data(),
                               totalNumberOfBondedInteractions,
                               *bSumEkinhOld, flags);
@@@ -239,7 -326,7 +239,7 @@@
       {
           correct_ekin(debug,
                        0, mdatoms->homenr,
- -                     state->v, vcm->group_p[0],
+ +                     as_rvec_array(state->v.data()), vcm->group_p[0],
                        mdatoms->massT, mdatoms->tmass, ekind->ekin);
       }
   
@@@ -248,7 -335,7 +248,7 @@@
       {
           check_cm_grp(fplog, vcm, ir, 1);
           do_stopcm_grp(0, mdatoms->homenr, mdatoms->cVCM,
- -                      state->x, state->v, vcm);
+ +                      as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), vcm);
           inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
       }
   
@@@ -313,18 -400,16 +313,18 @@@
       }
   }
   
- -void check_nst_param(FILE *fplog, t_commrec *cr,
- -                     const char *desc_nst, int nst,
- -                     const char *desc_p, int *p)
+ +/* check whether an 'nst'-style parameter p is a multiple of nst, and
+ +   set it to be one if not, with a warning. */
+ +static void check_nst_param(const gmx::MDLogger &mdlog,
+ +                            const char *desc_nst, int nst,
+ +                            const char *desc_p, int *p)
   {
       if (*p > 0 && *p % nst != 0)
       {
           /* Round up to the next multiple of nst */
           *p = ((*p)/nst + 1)*nst;
- -        md_print_warn(cr, fplog,
- -                      "NOTE: %s changes %s to %d\n", desc_nst, desc_p, *p);
+ +        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+ +                "NOTE: %s changes %s to %d", desc_nst, desc_p, *p);
       }
   }
   
@@@ -333,8 -418,7 +333,7 @@@ void set_current_lambdas(gmx_int64_t st
   /* find the current lambdas.  If rerunning, we either read in a state, or a lambda value,
      requiring different logic. */
   {
-     real frac;
-     int  i, fep_state = 0;
+     int  i;
       if (bRerunMD)
       {
           if (rerun_fr->bLambda)
@@@ -353,8 -437,8 +352,8 @@@
               else
               {
                   /* find out between which two value of lambda we should be */
-                 frac      = (step*fepvals->delta_lambda);
-                 fep_state = static_cast<int>(floor(frac*fepvals->n_lambda));
+                 real frac      = (step*fepvals->delta_lambda);
+                 int  fep_state = static_cast<int>(floor(frac*fepvals->n_lambda));
                   /* interpolate between this state and the next */
                   /* this assumes that the initial lambda corresponds to lambda==0, which is verified in grompp */
                   frac = (frac*fepvals->n_lambda)-fep_state;
@@@ -370,7 -454,7 +369,7 @@@
               state_global->fep_state = rerun_fr->fep_state;
               for (i = 0; i < efptNR; i++)
               {
-                 state_global->lambda[i] = fepvals->all_lambda[i][fep_state];
+                 state_global->lambda[i] = fepvals->all_lambda[i][state_global->fep_state];
               }
           }
       }
@@@ -379,10 -463,10 +378,10 @@@
           if (fepvals->delta_lambda != 0)
           {
               /* find out between which two value of lambda we should be */
-             frac = (step*fepvals->delta_lambda);
+             real frac = (step*fepvals->delta_lambda);
               if (fepvals->n_lambda > 0)
               {
-                 fep_state = static_cast<int>(floor(frac*fepvals->n_lambda));
+                 int fep_state = static_cast<int>(floor(frac*fepvals->n_lambda));
                   /* interpolate between this state and the next */
                   /* this assumes that the initial lambda corresponds to lambda==0, which is verified in grompp */
                   frac = (frac*fepvals->n_lambda)-fep_state;
@@@ -452,7 -536,8 +451,7 @@@ static int lcd4(int i1, int i2, int i3
       return nst;
   }
   
- -int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
- -                        int nstglobalcomm, t_inputrec *ir)
+ +int check_nstglobalcomm(const gmx::MDLogger &mdlog, int nstglobalcomm, t_inputrec *ir)
   {
       if (!EI_DYNAMICS(ir->eI))
       {
@@@ -500,43 -585,42 +499,43 @@@
               nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0)
           {
               nstglobalcomm = (nstglobalcomm / ir->nstlist)*ir->nstlist;
- -            md_print_warn(cr, fplog, "WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n", nstglobalcomm);
+ +            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+ +                    "WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d",
+ +                    nstglobalcomm);
           }
           if (ir->nstcalcenergy > 0)
           {
- -            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
+ +            check_nst_param(mdlog, "-gcom", nstglobalcomm,
                               "nstcalcenergy", &ir->nstcalcenergy);
           }
           if (ir->etc != etcNO && ir->nsttcouple > 0)
           {
- -            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
+ +            check_nst_param(mdlog, "-gcom", nstglobalcomm,
                               "nsttcouple", &ir->nsttcouple);
           }
           if (ir->epc != epcNO && ir->nstpcouple > 0)
           {
- -            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
+ +            check_nst_param(mdlog, "-gcom", nstglobalcomm,
                               "nstpcouple", &ir->nstpcouple);
           }
   
- -        check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
+ +        check_nst_param(mdlog, "-gcom", nstglobalcomm,
                           "nstenergy", &ir->nstenergy);
   
- -        check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
+ +        check_nst_param(mdlog, "-gcom", nstglobalcomm,
                           "nstlog", &ir->nstlog);
       }
   
       if (ir->comm_mode != ecmNO && ir->nstcomm < nstglobalcomm)
       {
- -        md_print_warn(cr, fplog, "WARNING: Changing nstcomm from %d to %d\n",
- -                      ir->nstcomm, nstglobalcomm);
+ +        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+ +                "WARNING: Changing nstcomm from %d to %d",
+ +                ir->nstcomm, nstglobalcomm);
           ir->nstcomm = nstglobalcomm;
       }
   
- -    if (fplog)
- -    {
- -        fprintf(fplog, "Intra-simulation communication will occur every %d steps.\n", nstglobalcomm);
- -    }
+ +    GMX_LOG(mdlog.info).appendTextFormatted(
+ +            "Intra-simulation communication will occur every %d steps.\n", nstglobalcomm);
       return nstglobalcomm;
   }
   
@@@ -559,7 -643,6 +558,7 @@@ void rerun_parallel_comm(t_commrec *cr
   
   }
   
+ +// TODO Most of this logic seems to belong in the respective modules
   void set_state_entries(t_state *state, const t_inputrec *ir)
   {
       /* The entries in the state in the tpx file might not correspond
@@@ -572,10 -655,33 +571,10 @@@
           state->flags |= (1<<estFEPSTATE);
       }
       state->flags |= (1<<estX);
- -    if (state->lambda == NULL)
- -    {
- -        snew(state->lambda, efptNR);
- -    }
- -    if (state->x == NULL)
- -    {
- -        /* We need to allocate one element extra, since we might use
- -         * (unaligned) 4-wide SIMD loads to access rvec entries.
- -         */
- -        snew(state->x, state->nalloc + 1);
- -    }
+ +    GMX_RELEASE_ASSERT(state->x.size() >= static_cast<unsigned int>(state->natoms), "We should start a run with an initialized state->x");
       if (EI_DYNAMICS(ir->eI))
       {
           state->flags |= (1<<estV);
- -        if (state->v == NULL)
- -        {
- -            snew(state->v, state->nalloc + 1);
- -        }
- -    }
- -    if (ir->eI == eiCG)
- -    {
- -        state->flags |= (1<<estCGP);
- -        if (state->cg_p == NULL)
- -        {
- -            /* cg_p is not stored in the tpx file, so we need to allocate it */
- -            snew(state->cg_p, state->nalloc + 1);
- -        }
       }
   
       state->nnhpres = 0;
@@@ -589,21 -695,23 +588,21 @@@
           if ((ir->epc == epcPARRINELLORAHMAN) || (ir->epc == epcMTTK))
           {
               state->flags |= (1<<estBOXV);
+ +            state->flags |= (1<<estPRES_PREV);
           }
- -        if (ir->epc != epcNO)
+ +        if (inputrecNptTrotter(ir) || (inputrecNphTrotter(ir)))
           {
- -            if (inputrecNptTrotter(ir) || (inputrecNphTrotter(ir)))
- -            {
- -                state->nnhpres = 1;
- -                state->flags  |= (1<<estNHPRES_XI);
- -                state->flags  |= (1<<estNHPRES_VXI);
- -                state->flags  |= (1<<estSVIR_PREV);
- -                state->flags  |= (1<<estFVIR_PREV);
- -                state->flags  |= (1<<estVETA);
- -                state->flags  |= (1<<estVOL0);
- -            }
- -            else
- -            {
- -                state->flags |= (1<<estPRES_PREV);
- -            }
+ +            state->nnhpres = 1;
+ +            state->flags  |= (1<<estNHPRES_XI);
+ +            state->flags  |= (1<<estNHPRES_VXI);
+ +            state->flags  |= (1<<estSVIR_PREV);
+ +            state->flags  |= (1<<estFVIR_PREV);
+ +            state->flags  |= (1<<estVETA);
+ +            state->flags  |= (1<<estVOL0);
+ +        }
+ +        if (ir->epc == epcBERENDSEN)
+ +        {
+ +            state->flags  |= (1<<estBAROS_INT);
           }
       }
   
@@@ -613,17 -721,15 +612,17 @@@
           state->flags |= (1<<estNH_VXI);
       }
   
- -    if (ir->etc == etcVRESCALE)
+ +    if (ir->etc == etcVRESCALE || ir->etc == etcBERENDSEN)
       {
- -        state->flags |= (1<<estTC_INT);
+ +        state->flags |= (1<<estTHERM_INT);
       }
   
       init_gtc_state(state, state->ngtc, state->nnhpres, ir->opts.nhchainlength); /* allocate the space for nose-hoover chains */
       init_ekinstate(&state->ekinstate, ir);
- -    snew(state->enerhist, 1);
- -    init_energyhistory(state->enerhist);
- -    init_df_history(&state->dfhist, ir->fepvals->n_lambda);
- -    state->swapstate.eSwapCoords = ir->eSwapCoords;
+ +
+ +    if (ir->bExpanded)
+ +    {
+ +        snew(state->dfhist, 1);
+ +        init_df_history(state->dfhist, ir->fepvals->n_lambda);
+ +    }
   }
diff --combined src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index c4ec038d2c4504a46c70d8d66ce3c1b475bf1377,00bc3b2b926f16ada3d90bd4a3906186036179f3..c0ef88571064e342206c18a5594ece1d153b73fa
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
--- 2/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@@ -46,7 -46,6 +46,7 @@@
    */
   
   #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+ +#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
   #include "gromacs/math/utilities.h"
   #include "gromacs/pbcutil/ishift.h"
   /* Note that floating-point constants in CUDA code should be suffixed
@@@ -54,8 -53,8 +54,8 @@@
    * code that is in double precision.
    */
   
- -#if GMX_PTX_ARCH < 300
- -#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300"
+ +#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
+ +#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300 or host pass"
   #endif
   
   #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
@@@ -86,7 -85,7 +86,7 @@@
      Kernel launch parameters:
       - #blocks   = #pair lists, blockId = pair list Id
       - #threads  = NTHREAD_Z * c_clSize^2
- -    - shmem     = see nbnxn_cuda.cu:calc_shmem_required()
+ +    - shmem     = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
   
       Each thread calculates an i force-component taking one pair of i-j atoms.
    */
@@@ -111,11 -110,15 +111,15 @@@
    * shuffle-based reduction, hence CC >= 3.0.
    *
    *
-  * NOTEs / TODO on Volta / CUDA 9 support extensions:
-  * - the current way of computing active mask using ballot_sync() should be
-  *   reconsidered: we can compute all masks with bitwise ops iso ballot and
-  *   secondly, all conditionals are warp-uniform, so the sync is not needed;
-  * - reconsider the use of __syncwarp(): its only role is currently to prevent
+  * NOTEs on Volta / CUDA 9 extensions:
+  *
+  * - While active thread masks are required for the warp collectives
+  *   (we use any and shfl), the kernel is designed such that all conditions
+  *   (other than the inner-most distance check) including loop trip counts
+  *   are warp-synchronous. Therefore, we don't need ballot to compute the
+  *   active masks as these are all full-warp masks.
+  *
+  * - TODO: reconsider the use of __syncwarp(): its only role is currently to prevent
    *   WAR hazard due to the cj preload; we should try to replace it with direct
    *   loads (which may be faster given the improved L1 on Volta).
    */
@@@ -195,12 -198,15 +199,12 @@@ __global__ void NB_KERNEL_FUNC_NAME(nbn
   #ifdef EL_RF
       float two_k_rf              = nbparam.two_k_rf;
   #endif
- -#ifdef EL_EWALD_TAB
- -    float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
- -#endif
   #ifdef EL_EWALD_ANA
       float beta2                 = nbparam.ewald_beta*nbparam.ewald_beta;
       float beta3                 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
   #endif
   #ifdef PRUNE_NBL
- -    float rlist_sq              = nbparam.rlist_sq;
+ +    float rlist_sq              = nbparam.rlistOuter_sq;
   #endif
   
   #ifdef CALC_ENERGIES
@@@ -283,8 -289,7 +287,8 @@@
           ci = sci * c_numClPerSupercl + tidxj;
           ai = ci * c_clSize + tidxi;
   
- -        xqbuf    = xq[ai] + shift_vec[nb_sci.shift];
+ +        float  *shiftptr = (float *)&shift_vec[nb_sci.shift];
+ +        xqbuf    = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
           xqbuf.w *= nbparam.epsfac;
           xqib[tidxj * c_clSize + tidxi] = xqbuf;
   
@@@ -326,11 -331,7 +330,11 @@@
   #endif
   
   #ifdef LJ_EWALD
+ +    #if DISABLE_CUDA_TEXTURES
+ +            E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+ +    #else
               E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+ +    #endif
   #endif
           }
   
@@@ -354,12 -355,7 +358,11 @@@
   
   #endif                                  /* CALC_ENERGIES */
   
-     int          j4LoopStart      = cij4_start + tidxz;
-     unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
+ +#ifdef EXCLUSION_FORCES
+ +    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+ +#endif
+ +
+     int          j4LoopStart = cij4_start + tidxz;
       /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
       for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
       {
@@@ -367,18 -363,16 +370,16 @@@
           imask       = pl_cj4[j4].imei[widx].imask;
           wexcl       = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
   
-         unsigned int imaskSkipConditionThreadMask = j4LoopThreadMask;
   #ifndef PRUNE_NBL
-         imaskSkipConditionThreadMask = gmx_ballot_sync(j4LoopThreadMask, imask);
           if (imask)
   #endif
           {
               /* Pre-load cj into shared memory on both warps separately */
- -            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+ +            if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
               {
                   cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
               }
-             gmx_syncwarp(imaskSkipConditionThreadMask);
+             gmx_syncwarp(c_fullWarpMask);
   
               /* Unrolling this loop
                  - with pruning leads to register spilling;
@@@ -386,8 -380,7 +387,7 @@@
                  Tested with up to nvcc 7.5 */
               for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
               {
-                 const unsigned int jmSkipCondition           = imask & (superClInteractionMask << (jm * c_numClPerSupercl));
-                 const unsigned int jmSkipConditionThreadMask = gmx_ballot_sync(imaskSkipConditionThreadMask, jmSkipCondition);
+                 const unsigned int jmSkipCondition = imask & (superClInteractionMask << (jm * c_numClPerSupercl));
                   if (jmSkipCondition)
                   {
                       mask_ji = (1U << (jm * c_numClPerSupercl));
@@@ -412,8 -405,7 +412,7 @@@
   #endif
                       for (i = 0; i < c_numClPerSupercl; i++)
                       {
-                         const unsigned int iInnerSkipCondition           = imask & mask_ji;
-                         const unsigned int iInnerSkipConditionThreadMask = gmx_ballot_sync(jmSkipConditionThreadMask, iInnerSkipCondition);
+                         const unsigned int iInnerSkipCondition = imask & mask_ji;
                           if (iInnerSkipCondition)
                           {
                               ci      = sci * c_numClPerSupercl + i; /* i cluster index */
@@@ -430,7 -422,7 +429,7 @@@
                               /* If _none_ of the atoms pairs are in cutoff range,
                                  the bit corresponding to the current
                                  cluster-pair in imask gets set to 0. */
-                             if (!gmx_any_sync(iInnerSkipConditionThreadMask, r2 < rlist_sq))
+                             if (!gmx_any_sync(c_fullWarpMask, r2 < rlist_sq))
                               {
                                   imask &= ~mask_ji;
                               }
@@@ -440,9 -432,10 +439,9 @@@
   
                               /* cutoff & exclusion check */
   #ifdef EXCLUSION_FORCES
- -                            if (r2 < rcoulomb_sq *
- -                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+ +                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
   #else
- -                            if (r2 < rcoulomb_sq * int_bit)
+ +                            if ((r2 < rcoulomb_sq) * int_bit)
   #endif
                               {
                                   /* load the rest of the i-atom parameters */
@@@ -451,7 -444,8 +450,7 @@@
   #ifndef LJ_COMB
                                   /* LJ 6*C6 and 12*C12 */
                                   typei   = atib[i * c_clSize + tidxi];
- -                                c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
- -                                c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+ +                                fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
   #else
                                   ljcp_i  = ljcpib[i * c_clSize + tidxi];
   #ifdef LJ_COMB_GEOM
@@@ -525,9 -519,9 +524,9 @@@
   
   #ifdef LJ_POT_SWITCH
   #ifdef CALC_ENERGIES
- -                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+ +                                calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
   #else
- -                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+ +                                calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
   #endif /* CALC_ENERGIES */
   #endif /* LJ_POT_SWITCH */
   
@@@ -561,7 -555,7 +560,7 @@@
                                   F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
   #elif defined EL_EWALD_TAB
                                   F_invr  += qi * qj_f * (int_bit*inv_r2 -
- -                                                        interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)) * inv_r;
+ +                                                        interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
   #endif                          /* EL_EWALD_ANA/TAB */
   
   #ifdef CALC_ENERGIES
@@@ -591,7 -585,7 +590,7 @@@
                       }
   
                       /* reduce j forces */
-                     reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj, jmSkipConditionThreadMask);
+                     reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj, c_fullWarpMask);
                   }
               }
   #ifdef PRUNE_NBL
@@@ -601,9 -595,7 +600,7 @@@
   #endif
           }
           // avoid shared memory WAR hazards between loop iterations
-         gmx_syncwarp(j4LoopThreadMask);
-         // update thread mask for next loop iteration
-         j4LoopThreadMask = gmx_ballot_sync(j4LoopThreadMask, (j4 + NTHREAD_Z) < cij4_end);
+         gmx_syncwarp(c_fullWarpMask);
       }
   
       /* skip central shifts when summing shift forces */
diff --combined src/gromacs/simd/simd_math.h

index d44560611cd6c236291204c84b485fa8c47dbbf3,b11e0edc9bfa1604061db11e254afecd207ee1e9..e3f1db68258b361d1753d6ceedca8686b32a53b7
--- 1/src/gromacs/simd/simd_math.h
--- 2/src/gromacs/simd/simd_math.h
+++ b/src/gromacs/simd/simd_math.h
@@@ -131,15 -131,7 +131,15 @@@ rsqrtIter(SimdFloat lu, SimdFloat x
   
   /*! \brief Calculate 1/sqrt(x) for SIMD float.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline SimdFloat gmx_simdcall
@@@ -160,22 -152,13 +160,22 @@@ invsqrt(SimdFloat x
   
   /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
    *
- - * \param x0  First set of arguments, x0 must be positive - no argument checking.
- - * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ + * \param x0  First set of arguments, x0 must be in single range (see below).
+ + * \param x1  Second set of arguments, x1 must be in single range (see below).
    * \param[out] out0  Result 1/sqrt(x0)
    * \param[out] out1  Result 1/sqrt(x1)
    *
    *  In particular for double precision we can sometimes calculate square root
    *  pairs slightly faster by using single precision until the very last step.
+ + *
+ + * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ + *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *       For the single precision implementation this is obviously always
+ + *       true for positive values, but for double precision it adds an
+ + *       extra restriction since the first lookup step might have to be
+ + *       performed in single precision on some architectures. Note that the
+ + *       responsibility for checking falls on you - this routine does not
+ + *       check arguments.
    */
   static inline void gmx_simdcall
   invsqrtPair(SimdFloat x0,    SimdFloat x1,
@@@ -204,15 -187,7 +204,15 @@@ rcpIter(SimdFloat lu, SimdFloat x
   
   /*! \brief Calculate 1/x for SIMD float.
    *
- - *  \param x Argument that must be nonzero. This routine does not check arguments.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/x. Result is undefined if your argument was invalid.
    */
   static inline SimdFloat gmx_simdcall
@@@ -234,12 -209,7 +234,12 @@@ inv(SimdFloat x
   /*! \brief Division for SIMD floats
    *
    * \param nom    Nominator
- - * \param denom  Denominator
+ + * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
+ + *               For single precision this is equivalent to a nonzero argument,
+ + *               but in double precision it adds an extra restriction since
+ + *               the first lookup step might have to be performed in single
+ + *               precision on some architectures. Note that the responsibility
+ + *               for checking falls on you - this routine does not check arguments.
    *
    * \return nom/denom
    *
@@@ -258,9 -228,7 +258,9 @@@ operator/(SimdFloat nom, SimdFloat deno
    *  Illegal values in the masked-out elements will not lead to
    *  floating-point exceptions.
    *
- - *  \param x Argument that must be >0 for masked-in entries
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX for masked-in entries.
+ + *           See \ref invsqrt for the discussion about argument restrictions.
    *  \param m Mask
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
    *          entry was not masked, and 0.0 for masked-out entries.
@@@ -283,9 -251,7 +283,9 @@@ maskzInvsqrt(SimdFloat x, SimdFBool m
   
   /*! \brief Calculate 1/x for SIMD float, masked version.
    *
- - *  \param x Argument that must be nonzero for non-masked entries.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX for masked-in entries.
+ + *           See \ref invsqrt for the discussion about argument restrictions.
    *  \param m Mask
    *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
    */
@@@ -305,37 -271,17 +305,37 @@@ maskzInv(SimdFloat x, SimdFBool m
       return lu;
   }
   
- -/*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
+ +/*! \brief Calculate sqrt(x) for SIMD floats
+ + *
+ + *  \tparam opt By default, this function checks if the input value is 0.0
+ + *              and masks this to return the correct result. If you are certain
+ + *              your argument will never be zero, and you know you need to save
+ + *              every single cycle you can, you can alternatively call the
+ + *              function as sqrt<MathOptimization::Unsafe>(x).
+ + *
+ + *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
+ + *              lookup step often has to be implemented in single precision.
+ + *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
+ + *              result, even in double precision. If you are using the unsafe
+ + *              math optimization parameter, the argument must be in the range
+ + *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
    *
- - *  \param x Argument that must be >=0.
- - *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- - *          The result is undefined if the input value is negative.
+ + *  \return sqrt(x). The result is undefined if the input value does not fall
+ + *          in the allowed range specified for the argument.
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   sqrt(SimdFloat x)
   {
- -    SimdFloat  res = maskzInvsqrt(x, setZero() < x);
- -    return res*x;
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        SimdFloat  res = maskzInvsqrt(x, setZero() < x);
+ +        return res*x;
+ +    }
+ +    else
+ +    {
+ +        return x * invsqrt(x);
+ +    }
   }
   
   #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
@@@ -382,44 -328,16 +382,44 @@@ log(SimdFloat x
   #endif
   
   #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
- -/*! \brief SIMD float 2^x.
- - *
- - * \param x Argument.
- - * \result 2^x. Undefined if input argument caused overflow.
+ +/*! \brief SIMD float 2^x
+ + *
+ + * \tparam opt If this is changed from the default (safe) into the unsafe
+ + *             option, input values that would otherwise lead to zero-clamped
+ + *             results are not allowed and will lead to undefined results.
+ + *
+ + * \param x Argument. For the default (safe) function version this can be
+ + *          arbitrarily small value, but the routine might clamp the result to
+ + *          zero for arguments that would produce subnormal IEEE754-2008 results.
+ + *          This corresponds to inputs below -126 in single or -1022 in double,
+ + *          and it might overflow for arguments reaching 127 (single) or
+ + *          1023 (double). If you enable the unsafe math optimization,
+ + *          very small arguments will not necessarily be zero-clamped, but
+ + *          can produce undefined results.
+ + *
+ + * \result 2^x. The result is undefined for very large arguments that cause
+ + *          internal floating-point overflow. If unsafe optimizations are enabled,
+ + *          this is also true for very small values.
+ + *
+ + * \note    The definition range of this function is just-so-slightly smaller
+ + *          than the allowed IEEE exponents for many architectures. This is due
+ + *          to the implementation, which will hopefully improve in the future.
+ + *
+ + * \warning You cannot rely on this implementation returning inf for arguments
+ + *          that cause overflow. If you have some very large
+ + *          values and need to rely on getting a valid numerical output,
+ + *          take the minimum of your variable and the largest valid argument
+ + *          before calling this routine.
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   exp2(SimdFloat x)
   {
- -    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
- -    const SimdFloat  arglimit(126.0f);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
+ +    // Note that we use a slightly wider range inside this *implementation* compared
+ +    // to that guaranteed by the API documentation in the comment above (which has
+ +    // to cover all architectures).
+ +    const SimdFloat  smallArgLimit(-127.0f);
       const SimdFloat  CC6(0.0001534581200287996416911311f);
       const SimdFloat  CC5(0.001339993121934088894618990f);
       const SimdFloat  CC4(0.009618488957115180159497841f);
@@@ -431,30 -349,12 +431,30 @@@
       SimdFloat        intpart;
       SimdFloat        fexppart;
       SimdFloat        p;
- -    SimdFBool        m;
+ +
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the exponents reaches -127, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -127. At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
   
       fexppart  = ldexp(one, cvtR2I(x));
       intpart   = round(x);
- -    m         = abs(x) <= arglimit;
- -    fexppart  = selectByMask(fexppart, m);
       x         = x - intpart;
   
       p         = fma(CC6, x, CC5);
@@@ -474,44 -374,16 +474,44 @@@
    * In addition to scaling the argument for 2^x this routine correctly does
    * extended precision arithmetics to improve accuracy.
    *
- - * \param x Argument.
- - * \result exp(x). Undefined if input argument caused overflow,
- - * which can happen if abs(x) \> 7e13.
+ + * \tparam opt If this is changed from the default (safe) into the unsafe
+ + *             option, input values that would otherwise lead to zero-clamped
+ + *             results are not allowed and will lead to undefined results.
+ + *
+ + * \param x Argument. For the default (safe) function version this can be
+ + *          arbitrarily small value, but the routine might clamp the result to
+ + *          zero for arguments that would produce subnormal IEEE754-2008 results.
+ + *          This corresponds to input arguments reaching
+ + *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
+ + *          Similarly, it might overflow for arguments reaching
+ + *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
+ + *          unsafe math optimizations are enabled, small input values that would
+ + *          result in zero-clamped output are not allowed.
+ + *
+ + * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
+ + *         depending on the underlying implementation. If unsafe optimizations
+ + *         are enabled, this is also true for very small values.
+ + *
+ + * \note    The definition range of this function is just-so-slightly smaller
+ + *          than the allowed IEEE exponents for many architectures. This is due
+ + *          to the implementation, which will hopefully improve in the future.
+ + *
+ + * \warning You cannot rely on this implementation returning inf for arguments
+ + *          that cause overflow. If you have some very large
+ + *          values and need to rely on getting a valid numerical output,
+ + *          take the minimum of your variable and the largest valid argument
+ + *          before calling this routine.
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   exp(SimdFloat x)
   {
       const SimdFloat  argscale(1.44269504088896341f);
- -    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
- -    const SimdFloat  arglimit(126.0f);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
+ +    // Note that we use a slightly wider range inside this *implementation* compared
+ +    // to that guaranteed by the API documentation in the comment above (which has
+ +    // to cover all architectures).
+ +    const SimdFloat  smallArgLimit(-88.0296919311f);
       const SimdFloat  invargscale0(-0.693145751953125f);
       const SimdFloat  invargscale1(-1.428606765330187045e-06f);
       const SimdFloat  CC4(0.00136324646882712841033936f);
@@@ -523,31 -395,13 +523,31 @@@
       SimdFloat        fexppart;
       SimdFloat        intpart;
       SimdFloat        y, p;
- -    SimdFBool        m;
+ +
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the IEEE exponents reaches -127, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -127*ln(2). At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
   
       y         = x * argscale;
       fexppart  = ldexp(one, cvtR2I(y));
       intpart   = round(y);
- -    m         = (abs(y) <= arglimit);
- -    fexppart  = selectByMask(fexppart, m);
   
       // Extended precision arithmetics
       x         = fma(invargscale0, intpart, x);
@@@ -1503,15 -1357,7 +1503,15 @@@ rsqrtIter(SimdDouble lu, SimdDouble x
   
   /*! \brief Calculate 1/sqrt(x) for SIMD double.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline SimdDouble gmx_simdcall
@@@ -1535,22 -1381,13 +1535,22 @@@ invsqrt(SimdDouble x
   
   /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
    *
- - * \param x0  First set of arguments, x0 must be positive - no argument checking.
- - * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ + * \param x0  First set of arguments, x0 must be in single range (see below).
+ + * \param x1  Second set of arguments, x1 must be in single range (see below).
    * \param[out] out0  Result 1/sqrt(x0)
    * \param[out] out1  Result 1/sqrt(x1)
    *
    *  In particular for double precision we can sometimes calculate square root
    *  pairs slightly faster by using single precision until the very last step.
+ + *
+ + * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ + *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *       For the single precision implementation this is obviously always
+ + *       true for positive values, but for double precision it adds an
+ + *       extra restriction since the first lookup step might have to be
+ + *       performed in single precision on some architectures. Note that the
+ + *       responsibility for checking falls on you - this routine does not
+ + *       check arguments.
    */
   static inline void gmx_simdcall
   invsqrtPair(SimdDouble x0,    SimdDouble x1,
@@@ -1607,15 -1444,7 +1607,15 @@@ rcpIter(SimdDouble lu, SimdDouble x
   
   /*! \brief Calculate 1/x for SIMD double.
    *
- - *  \param x Argument that must be nonzero. This routine does not check arguments.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/x. Result is undefined if your argument was invalid.
    */
   static inline SimdDouble gmx_simdcall
@@@ -1640,12 -1469,7 +1640,12 @@@ inv(SimdDouble x
   /*! \brief Division for SIMD doubles
    *
    * \param nom    Nominator
- - * \param denom  Denominator
+ + * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
+ + *               For single precision this is equivalent to a nonzero argument,
+ + *               but in double precision it adds an extra restriction since
+ + *               the first lookup step might have to be performed in single
+ + *               precision on some architectures. Note that the responsibility
+ + *               for checking falls on you - this routine does not check arguments.
    *
    * \return nom/denom
    *
@@@ -1665,9 -1489,7 +1665,9 @@@ operator/(SimdDouble nom, SimdDouble de
    *  Illegal values in the masked-out elements will not lead to
    *  floating-point exceptions.
    *
- - *  \param x Argument that must be >0 for masked-in entries
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX for masked-in entries.
+ + *           See \ref invsqrt for the discussion about argument restrictions.
    *  \param m Mask
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
    *          entry was not masked, and 0.0 for masked-out entries.
@@@ -1693,9 -1515,7 +1693,9 @@@ maskzInvsqrt(SimdDouble x, SimdDBool m
   
   /*! \brief Calculate 1/x for SIMD double, masked version.
    *
- - *  \param x Argument that must be nonzero for non-masked entries.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX for masked-in entries.
+ + *           See \ref invsqrt for the discussion about argument restrictions.
    *  \param m Mask
    *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
    */
@@@ -1719,23 -1539,18 +1719,24 @@@ maskzInv(SimdDouble x, SimdDBool m
   }
   
   
- -/*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
+ +/*! \brief Calculate sqrt(x) for SIMD doubles.
    *
- - *  \param x Argument that must be >=0.
- - *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- - *          if x>0 && x<float_min, the result will incorrectly be set to 0.
- - *          The result is undefined if the input value is negative.
+ + *  \copydetails sqrt(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   sqrt(SimdDouble x)
   {
- -    // As we might use a float version of rsqrt, we mask out small values
- -    return x * maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) <= x);
+ +    if (opt == MathOptimization::Safe)
+ +    {
++        // As we might use a float version of rsqrt, we mask out small values
+ +        SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
+ +        return res*x;
+ +    }
+ +    else
+ +    {
+ +        return x * invsqrt(x);
+ +    }
   }
   
   #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
@@@ -1790,14 -1605,13 +1791,14 @@@ log(SimdDouble x
   #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
   /*! \brief SIMD double 2^x.
    *
- - * \param x Argument.
- - * \result 2^x. Undefined if input argument caused overflow.
+ + * \copydetails exp2(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   exp2(SimdDouble x)
   {
- -    const SimdDouble  arglimit(1022.0);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+ +    const SimdDouble  smallArgLimit(-1023.0);
       const SimdDouble  CE11(4.435280790452730022081181e-10);
       const SimdDouble  CE10(7.074105630863314448024247e-09);
       const SimdDouble  CE9(1.017819803432096698472621e-07);
@@@ -1814,30 -1628,12 +1815,30 @@@
       SimdDouble        intpart;
       SimdDouble        fexppart;
       SimdDouble        p;
- -    SimdDBool         m;
+ +
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -1023. At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
   
       fexppart  = ldexp(one, cvtR2I(x));
       intpart   = round(x);
- -    m         = abs(x) <= arglimit;
- -    fexppart  = selectByMask(fexppart, m);
       x         = x - intpart;
   
       p         = fma(CE11, x, CE10);
@@@ -1859,15 -1655,18 +1860,15 @@@
   #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
   /*! \brief SIMD double exp(x).
    *
- - * In addition to scaling the argument for 2^x this routine correctly does
- - * extended precision arithmetics to improve accuracy.
- - *
- - * \param x Argument.
- - * \result exp(x). Undefined if input argument caused overflow,
- - * which can happen if abs(x) \> 7e13.
+ + * \copydetails exp(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   exp(SimdDouble x)
   {
       const SimdDouble  argscale(1.44269504088896340735992468100);
- -    const SimdDouble  arglimit(1022.0);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+ +    const SimdDouble  smallArgLimit(-709.0895657128);
       const SimdDouble  invargscale0(-0.69314718055966295651160180568695068359375);
       const SimdDouble  invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
       const SimdDouble  CE12(2.078375306791423699350304e-09);
@@@ -1885,31 -1684,13 +1886,31 @@@
       SimdDouble        fexppart;
       SimdDouble        intpart;
       SimdDouble        y, p;
- -    SimdDBool         m;
+ +
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -1023*ln(2). At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
   
       y         = x * argscale;
       fexppart  = ldexp(one, cvtR2I(y));
       intpart   = round(y);
- -    m         = (abs(y) <= arglimit);
- -    fexppart  = selectByMask(fexppart, m);
   
       // Extended precision arithmetics
       x         = fma(invargscale0, intpart, x);
@@@ -2916,15 -2697,7 +2917,15 @@@ pmePotentialCorrection(SimdDouble z2
   
   /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline SimdDouble gmx_simdcall
@@@ -2949,15 -2722,7 +2950,15 @@@ invsqrtSingleAccuracy(SimdDouble x
    *  Illegal values in the masked-out elements will not lead to
    *  floating-point exceptions.
    *
- - *  \param x Argument that must be >0 for masked-in entries
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \param m Mask
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
    *          entry was not masked, and 0.0 for masked-out entries.
@@@ -2980,22 -2745,13 +2981,22 @@@ maskzInvsqrtSingleAccuracy(SimdDouble x
   
   /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
    *
- - * \param x0  First set of arguments, x0 must be positive - no argument checking.
- - * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ + * \param x0  First set of arguments, x0 must be in single range (see below).
+ + * \param x1  Second set of arguments, x1 must be in single range (see below).
    * \param[out] out0  Result 1/sqrt(x0)
    * \param[out] out1  Result 1/sqrt(x1)
    *
    *  In particular for double precision we can sometimes calculate square root
    *  pairs slightly faster by using single precision until the very last step.
+ + *
+ + * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ + *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *       For the single precision implementation this is obviously always
+ + *       true for positive values, but for double precision it adds an
+ + *       extra restriction since the first lookup step might have to be
+ + *       performed in single precision on some architectures. Note that the
+ + *       responsibility for checking falls on you - this routine does not
+ + *       check arguments.
    */
   static inline void gmx_simdcall
   invsqrtPairSingleAccuracy(SimdDouble x0,    SimdDouble x1,
@@@ -3027,15 -2783,7 +3028,15 @@@
   
   /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
    *
- - *  \param x Argument that must be nonzero. This routine does not check arguments.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \return 1/x. Result is undefined if your argument was invalid.
    */
   static inline SimdDouble gmx_simdcall
@@@ -3056,15 -2804,7 +3057,15 @@@ invSingleAccuracy(SimdDouble x
   
   /*! \brief 1/x for masked entries of SIMD double, single accuracy.
    *
- - *  \param x Argument that must be nonzero for non-masked entries.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *
    *  \param m Mask
    *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
    */
@@@ -3085,26 -2825,18 +3086,26 @@@ maskzInvSingleAccuracy(SimdDouble x, Si
   }
   
   
- -/*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
+ +/*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
    *
- - *  \param x Argument that must be >=0.
- - *  \return sqrt(x). If x<float_min, the result will correctly be set to 0.
- - *          The result is undefined if the input value is negative.
+ + *  \copydetails sqrt(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   sqrtSingleAccuracy(SimdDouble x)
   {
- -    return x * maskzInvsqrtSingleAccuracy(x, SimdDouble(GMX_FLOAT_MIN) <= x);
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
+ +        return res*x;
+ +    }
+ +    else
+ +    {
+ +        return x * invsqrtSingleAccuracy(x);
+ +    }
   }
   
+ +
   /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
    *
    * \param x Argument, should be >0.
@@@ -3146,16 -2878,16 +3147,16 @@@ logSingleAccuracy(SimdDouble x
       return p;
   }
   
- -/*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
+ +/*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
    *
- - * \param x Argument.
- - * \result 2^x. Undefined if input argument caused overflow.
+ + * \copydetails exp2(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   exp2SingleAccuracy(SimdDouble x)
   {
- -    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
- -    const SimdDouble  arglimit = SimdDouble(126.0);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+ +    const SimdDouble  smallArgLimit(-1023.0);
       const SimdDouble  CC6(0.0001534581200287996416911311);
       const SimdDouble  CC5(0.001339993121934088894618990);
       const SimdDouble  CC4(0.009618488957115180159497841);
@@@ -3166,31 -2898,12 +3167,31 @@@
   
       SimdDouble        intpart;
       SimdDouble        p;
- -    SimdDBool         valuemask;
       SimdDInt32        ix;
   
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -1023. At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
+ +
       ix        = cvtR2I(x);
       intpart   = round(x);
- -    valuemask = (abs(x) <= arglimit);
       x         = x - intpart;
   
       p         = fma(CC6, x, CC5);
@@@ -3200,23 -2913,22 +3201,23 @@@
       p         = fma(p, x, CC1);
       p         = fma(p, x, one);
       x         = ldexp(p, ix);
- -    x         = selectByMask(x, valuemask);
   
       return x;
   }
   
- -/*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
+ +
+ +
+ +/*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
    *
- - * \param x Argument.
- - * \result exp(x). Undefined if input argument caused overflow.
+ + * \copydetails exp(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdDouble gmx_simdcall
   expSingleAccuracy(SimdDouble x)
   {
       const SimdDouble  argscale(1.44269504088896341);
- -    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127
- -    const SimdDouble  arglimit(126.0);
+ +    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+ +    const SimdDouble  smallArgLimit(-709.0895657128);
       const SimdDouble  invargscale(-0.69314718055994528623);
       const SimdDouble  CC4(0.00136324646882712841033936);
       const SimdDouble  CC3(0.00836596917361021041870117);
@@@ -3226,32 -2938,13 +3227,32 @@@
       const SimdDouble  one(1.0);
       SimdDouble        intpart;
       SimdDouble        y, p;
- -    SimdDBool         valuemask;
       SimdDInt32        iy;
   
+ +    // Large negative values are valid arguments to exp2(), so there are two
+ +    // things we need to account for:
+ +    // 1. When the exponents reaches -127, the (biased) exponent field will be
+ +    //    zero and we can no longer multiply with it. There are special IEEE
+ +    //    formats to handle this range, but for now we have to accept that
+ +    //    we cannot handle those arguments. If input value becomes even more
+ +    //    negative, it will start to loop and we would end up with invalid
+ +    //    exponents. Thus, we need to limit or mask this.
+ +    // 2. For VERY large negative values, we will have problems that the
+ +    //    subtraction to get the fractional part loses accuracy, and then we
+ +    //    can end up with overflows in the polynomial.
+ +    //
+ +    // For now, we handle this by clamping smaller arguments to -127. At this
+ +    // point we will already return zero, so we don't need to do anything
+ +    // extra for the exponent.
+ +
+ +    if (opt == MathOptimization::Safe)
+ +    {
+ +        x         = max(x, smallArgLimit);
+ +    }
+ +
       y         = x * argscale;
       iy        = cvtR2I(y);
       intpart   = round(y);        // use same rounding algorithm here
- -    valuemask = (abs(y) <= arglimit);
   
       // Extended precision arithmetics not needed since
       // we have double precision and only need single accuracy.
@@@ -3264,10 -2957,10 +3265,10 @@@
       p         = fma(x*x, p, x);
       p         = p + one;
       x         = ldexp(p, iy);
- -    x         = selectByMask(x, valuemask);
       return x;
   }
   
+ +
   /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
    *
    * \param x The value to calculate erf(x) for.
@@@ -3967,7 -3660,7 +3968,7 @@@ atan2SingleAccuracy(SimdDouble y, SimdD
    * added to \f$1/r\f$ the error will be insignificant.
    *
    */
- -static SimdDouble gmx_simdcall
+ +static inline SimdDouble gmx_simdcall
   pmeForceCorrectionSingleAccuracy(SimdDouble z2)
   {
       const SimdDouble  FN6(-1.7357322914161492954e-8);
@@@ -4039,7 -3732,7 +4040,7 @@@
    * This approximation achieves an accuracy slightly lower than 1e-6; when
    * added to \f$1/r\f$ the error will be insignificant.
    */
- -static SimdDouble gmx_simdcall
+ +static inline SimdDouble gmx_simdcall
   pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
   {
       const SimdDouble  VN6(1.9296833005951166339e-8);
@@@ -4112,15 -3805,8 +4113,15 @@@ rsqrtIter(Simd4Float lu, Simd4Float x
   
   /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
- - *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline Simd4Float gmx_simdcall
   invsqrt(Simd4Float x)
@@@ -4168,15 -3854,8 +4169,15 @@@ rsqrtIter(Simd4Double lu, Simd4Double x
   
   /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
- - *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline Simd4Double gmx_simdcall
   invsqrt(Simd4Double x)
@@@ -4204,15 -3883,8 +4205,15 @@@
   
   /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
- - *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline Simd4Double gmx_simdcall
   invsqrtSingleAccuracy(Simd4Double x)
@@@ -4239,15 -3911,8 +4240,15 @@@
   #if GMX_SIMD_HAVE_FLOAT
   /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
- - *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline SimdFloat gmx_simdcall
   invsqrtSingleAccuracy(SimdFloat x)
@@@ -4261,17 -3926,10 +4262,17 @@@
    *  Illegal values in the masked-out elements will not lead to
    *  floating-point exceptions.
    *
- - *  \param x Argument that must be >0 for masked-in entries
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
    *  \param m Mask
- - *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
- - *          entry was not masked, and 0.0 for masked-out entries.
+ + *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
+ + *           entry was not masked, and 0.0 for masked-out entries.
    */
   static inline SimdFloat
   maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
@@@ -4281,22 -3939,13 +4282,22 @@@
   
   /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
    *
- - * \param x0  First set of arguments, x0 must be positive - no argument checking.
- - * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ + * \param x0  First set of arguments, x0 must be in single range (see below).
+ + * \param x1  Second set of arguments, x1 must be in single range (see below).
    * \param[out] out0  Result 1/sqrt(x0)
    * \param[out] out1  Result 1/sqrt(x1)
    *
    *  In particular for double precision we can sometimes calculate square root
    *  pairs slightly faster by using single precision until the very last step.
+ + *
+ + * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ + *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *       For the single precision implementation this is obviously always
+ + *       true for positive values, but for double precision it adds an
+ + *       extra restriction since the first lookup step might have to be
+ + *       performed in single precision on some architectures. Note that the
+ + *       responsibility for checking falls on you - this routine does not
+ + *       check arguments.
    */
   static inline void gmx_simdcall
   invsqrtPairSingleAccuracy(SimdFloat x0,    SimdFloat x1,
@@@ -4307,15 -3956,8 +4308,15 @@@
   
   /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
    *
- - *  \param x Argument that must be nonzero. This routine does not check arguments.
- - *  \return 1/x. Result is undefined if your argument was invalid.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
+ + *  \return  1/x. Result is undefined if your argument was invalid.
    */
   static inline SimdFloat gmx_simdcall
   invSingleAccuracy(SimdFloat x)
@@@ -4326,16 -3968,9 +4327,16 @@@
   
   /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
    *
- - *  \param x Argument that must be nonzero for non-masked entries.
+ + *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
    *  \param m Mask
- - *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
+ + *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
    */
   static inline SimdFloat
   maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
@@@ -4343,15 -3978,16 +4344,15 @@@
       return maskzInv(x, m);
   }
   
- -/*! \brief Calculate sqrt(x) for SIMD float, only targeting single accuracy.
+ +/*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
    *
- - *  \param x Argument that must be >=0.
- - *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- - *          The result is undefined if the input value is negative.
+ + * \copydetails sqrt(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   sqrtSingleAccuracy(SimdFloat x)
   {
- -    return sqrt(x);
+ +    return sqrt<opt>(x);
   }
   
   /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
@@@ -4367,27 -4003,26 +4368,27 @@@ logSingleAccuracy(SimdFloat x
   
   /*! \brief SIMD float 2^x, only targeting single accuracy.
    *
- - * \param x Argument.
- - * \result 2^x. Undefined if input argument caused overflow.
+ + * \copydetails exp2(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   exp2SingleAccuracy(SimdFloat x)
   {
- -    return exp2(x);
+ +    return exp2<opt>(x);
   }
   
   /*! \brief SIMD float e^x, only targeting single accuracy.
    *
- - * \param x Argument.
- - * \result exp(x). Undefined if input argument caused overflow.
+ + * \copydetails exp(SimdFloat)
    */
+ +template <MathOptimization opt = MathOptimization::Safe>
   static inline SimdFloat gmx_simdcall
   expSingleAccuracy(SimdFloat x)
   {
- -    return exp(x);
+ +    return exp<opt>(x);
   }
   
+ +
   /*! \brief SIMD float erf(x), only targeting single accuracy.
    *
    * \param x The value to calculate erf(x) for.
@@@ -4548,14 -4183,7 +4549,14 @@@ pmePotentialCorrectionSingleAccuracy(Si
   #if GMX_SIMD4_HAVE_FLOAT
   /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
    *
- - *  \param x Argument that must be >0. This routine does not check arguments.
+ + *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ + *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ + *           For the single precision implementation this is obviously always
+ + *           true for positive values, but for double precision it adds an
+ + *           extra restriction since the first lookup step might have to be
+ + *           performed in single precision on some architectures. Note that the
+ + *           responsibility for checking falls on you - this routine does not
+ + *           check arguments.
    *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
    */
   static inline Simd4Float gmx_simdcall
diff --combined src/gromacs/swap/swapcoords.cpp

index 1d230a6c5f6758026f6d9dd5ea6a4573fff7240f,92cbe4ca5c83b978321e3b03ccc6eed6cdb39373..4333572aad3c94dabe77405c84fb08c7ca4921bc
--- 1/src/gromacs/swap/swapcoords.cpp
--- 2/src/gromacs/swap/swapcoords.cpp
+++ b/src/gromacs/swap/swapcoords.cpp
@@@ -58,14 -58,11 +58,14 @@@
   #include "gromacs/mdlib/groupcoord.h"
   #include "gromacs/mdlib/mdrun.h"
   #include "gromacs/mdlib/sim_util.h"
+ +#include "gromacs/mdtypes/commrec.h"
   #include "gromacs/mdtypes/inputrec.h"
   #include "gromacs/mdtypes/md_enums.h"
+ +#include "gromacs/mdtypes/observableshistory.h"
+ +#include "gromacs/mdtypes/swaphistory.h"
   #include "gromacs/pbcutil/pbc.h"
   #include "gromacs/timing/wallcycle.h"
- -#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/topology/mtop_lookup.h"
   #include "gromacs/topology/topology.h"
   #include "gromacs/utility/cstringutil.h"
   #include "gromacs/utility/fatalerror.h"
@@@ -76,8 -73,8 +76,8 @@@
   static const char *SwS      = {"SWAP:"};                                           /**< For output that comes from the swap module */
   static const char *SwSEmpty = {"     "};                                           /**< Placeholder for multi-line output */
   static const char* CompStr[eCompNR] = {"A", "B" };                                 /**< Compartment name */
- -static const char *SwapStr[eSwapTypesNR+1] = { "", "X-", "Y-", "Z-", NULL};        /**< Name for the swap types. */
- -static const char *DimStr[DIM+1] = { "X", "Y", "Z", NULL};                         /**< Name for the swap dimension. */
+ +static const char *SwapStr[eSwapTypesNR+1] = { "", "X-", "Y-", "Z-", nullptr};     /**< Name for the swap types. */
+ +static const char *DimStr[DIM+1] = { "X", "Y", "Z", nullptr};                      /**< Name for the swap dimension. */
   
   /** Keep track of through which channel the ions have passed */
   enum eChannelHistory {
@@@ -315,7 -312,7 +315,7 @@@ static void get_molecule_center
           rvec_add(reference, dx, correctPBCimage);
   
           /* Take weight into account */
- -        if (NULL == weights)
+ +        if (nullptr == weights)
           {
               wi = 1.0;
           }
@@@ -704,7 -701,7 +704,7 @@@ static void sortMoleculesIntoCompartmen
                   add_to_list(iAtom, &g->comp[comp], dist);
   
                   /* Master also checks for ion groups through which channel each ion has passed */
- -                if (MASTER(cr) && (g->comp_now != NULL) && !bIsSolvent)
+ +                if (MASTER(cr) && (g->comp_now != nullptr) && !bIsSolvent)
                   {
                       int globalAtomNr = g->ind[iAtom] + 1; /* PDB index starts at 1 ... */
                       detect_flux_per_channel(g, globalAtomNr, comp, g->xc[iAtom],
@@@ -740,7 -737,7 +740,7 @@@
           }
       }
   
- -    if (bIsSolvent && NULL != fpout)
+ +    if (bIsSolvent && nullptr != fpout)
       {
           fprintf(fpout, "# Solv. molecules in comp.%s: %d   comp.%s: %d\n",
                   CompStr[eCompA], g->comp[eCompA].nMol,
@@@ -845,7 -842,7 +845,7 @@@ static void get_initial_ioncounts
    * over the values from .cpt file to the swap data structure.
    */
   static void get_initial_ioncounts_from_cpt(
- -        t_inputrec *ir, swapstate_t *swapstate,
+ +        t_inputrec *ir, swaphistory_t *swapstate,
           t_commrec *cr, gmx_bool bVerbose)
   {
       t_swapcoords    *sc;
@@@ -927,10 -924,10 +927,10 @@@ static void bc_initial_concentrations
   /*! \brief Ensure that each atom belongs to at most one of the swap groups. */
   static void check_swap_groups(t_swap *s, int nat, gmx_bool bVerbose)
   {
- -    int  *nGroup    = NULL; /* This array counts for each atom in the MD system to
- -                               how many swap groups it belongs (should be 0 or 1!) */
+ +    int  *nGroup    = nullptr; /* This array counts for each atom in the MD system to
+ +                                  how many swap groups it belongs (should be 0 or 1!) */
       int   ind       = -1;
- -    int   nMultiple = 0;    /* Number of atoms belonging to multiple groups */
+ +    int   nMultiple = 0;       /* Number of atoms belonging to multiple groups */
   
   
       if (bVerbose)
@@@ -978,16 -975,17 +978,16 @@@ static int get_group_apm_check
           int                         igroup,
           t_swap                     *s,
           gmx_bool                    bVerbose,
- -        const gmx_mtop_atomlookup_t alook,
           gmx_mtop_t                 *mtop)
   {
- -    int        molb, molnr, atnr_mol;
       t_swapgrp *g   = &s->group[igroup];
       int       *ind = s->group[igroup].ind;
       int        nat = s->group[igroup].nat;
   
       /* Determine the number of solvent atoms per solvent molecule from the
        * first solvent atom: */
- -    gmx_mtop_atomnr_to_molblock_ind(alook, ind[0], &molb, &molnr, &atnr_mol);
+ +    int molb = 0;
+ +    mtopGetMolblockIndex(mtop, ind[0], &molb, nullptr, nullptr);
       int apm = mtop->molblock[molb].natoms_mol;
   
       if (bVerbose)
@@@ -999,7 -997,7 +999,7 @@@
       /* Check whether this is also true for all other solvent atoms */
       for (int i = 1; i < nat; i++)
       {
- -        gmx_mtop_atomnr_to_molblock_ind(alook, ind[i], &molb, &molnr, &atnr_mol);
+ +        mtopGetMolblockIndex(mtop, ind[i], &molb, nullptr, nullptr);
           if (apm != mtop->molblock[molb].natoms_mol)
           {
               gmx_fatal(FARGS, "Not all molecules of swap group %d consist of %d atoms.",
@@@ -1049,9 -1047,9 +1049,9 @@@ static void print_ionlist_legend(t_inpu
       }
   
       // Center of split groups
- -    snprintf(buf, STRLEN, "%scenter of %s of split group 0", SwapStr[ir->eSwapCoords], (NULL != s->group[eGrpSplit0].m) ? "mass" : "geometry");
+ +    snprintf(buf, STRLEN, "%scenter of %s of split group 0", SwapStr[ir->eSwapCoords], (nullptr != s->group[eGrpSplit0].m) ? "mass" : "geometry");
       legend[count++] = gmx_strdup(buf);
- -    snprintf(buf, STRLEN, "%scenter of %s of split group 1", SwapStr[ir->eSwapCoords], (NULL != s->group[eGrpSplit1].m) ? "mass" : "geometry");
+ +    snprintf(buf, STRLEN, "%scenter of %s of split group 1", SwapStr[ir->eSwapCoords], (nullptr != s->group[eGrpSplit1].m) ? "mass" : "geometry");
       legend[count++] = gmx_strdup(buf);
   
       // Ion flux for each channel and ion type
@@@ -1091,24 -1089,30 +1091,24 @@@
    * they go.
    */
   static void detect_flux_per_channel_init(
- -        t_commrec   *cr,
- -        t_swap      *s,
- -        swapstate_t *swapstate,
- -        gmx_bool     bStartFromCpt)
+ +        t_swap        *s,
+ +        swaphistory_t *swapstate,
+ +        gmx_bool       bStartFromCpt)
   {
       t_swapgrp       *g;
       swapstateIons_t *gs;
   
+ +    /* All these flux detection routines run on the master only */
+ +    if (swapstate == nullptr)
+ +    {
+ +        return;
+ +    }
   
       for (int ig = eSwapFixedGrpNR; ig < s->ngrp; ig++)
       {
           g  = &s->group[ig];
           gs = &swapstate->ionType[ig - eSwapFixedGrpNR];
   
- -        /* All these flux detection routines run on the master only */
- -        if (!MASTER(cr))
- -        {
- -            g->comp_now      = NULL;
- -            g->comp_from     = NULL;
- -            g->channel_label = NULL;
- -
- -            return;
- -        }
- -
           /******************************************************/
           /* Channel and domain history for the individual ions */
           /******************************************************/
@@@ -1201,13 -1205,13 +1201,13 @@@ static void outputStartStructureIfWante
   {
       char *env = getenv("GMX_COMPELDUMP");
   
- -    if (env != NULL)
+ +    if (env != nullptr)
       {
           fprintf(stderr, "\n%s Found env.var. GMX_COMPELDUMP, will output CompEL starting structure made whole.\n"
                   "%s In case of multimeric channels, please check whether they have the correct PBC representation.\n",
                   SwS, SwSEmpty);
   
- -        write_sto_conf_mtop("CompELAssumedWholeConfiguration.pdb", *mtop->name, mtop, x, NULL, ePBC, box);
+ +        write_sto_conf_mtop("CompELAssumedWholeConfiguration.pdb", *mtop->name, mtop, x, nullptr, ePBC, box);
       }
   }
   
@@@ -1223,14 -1227,14 +1223,14 @@@
    * multimeric channels at the last time step.
    */
   static void init_swapstate(
- -        swapstate_t      *swapstate,
+ +        swaphistory_t    *swapstate,
           t_swapcoords     *sc,
           gmx_mtop_t       *mtop,
           rvec              x[],      /* the initial positions */
           matrix            box,
- -        int               ePBC)
+ +        t_inputrec       *ir)
   {
- -    rvec      *x_pbc  = NULL; /* positions of the whole MD system with molecules made whole */
+ +    rvec      *x_pbc  = nullptr; /* positions of the whole MD system with molecules made whole */
       t_swapgrp *g;
       t_swap    *s;
   
@@@ -1255,8 -1259,6 +1255,8 @@@
       }
       else
       {
+ +        swapstate->eSwapCoords = ir->eSwapCoords;
+ +
           /* Set the number of ion types and allocate memory for checkpointing */
           swapstate->nIonTypes = s->ngrp - eSwapFixedGrpNR;
           snew(swapstate->ionType, swapstate->nIonTypes);
@@@ -1276,10 -1278,10 +1276,10 @@@
           copy_rvecn(x, x_pbc, 0, mtop->natoms);
   
           /* This can only make individual molecules whole, not multimers */
- -        do_pbc_mtop(NULL, ePBC, box, mtop, x_pbc);
+ +        do_pbc_mtop(nullptr, ir->ePBC, box, mtop, x_pbc);
   
           /* Output the starting structure? */
- -        outputStartStructureIfWanted(mtop, x_pbc, ePBC, box);
+ +        outputStartStructureIfWanted(mtop, x_pbc, ir->ePBC, box);
   
           /* If this is the first run (i.e. no checkpoint present) we assume
            * that the starting positions give us the correct PBC representation */
@@@ -1336,7 -1338,7 +1336,7 @@@ static real getRequestedChargeImbalance
    * This routine should be called for the 'anions' and 'cations' group,
    * of which the indices were lumped together in the older version of the code.
    */
- -void copyIndicesToGroup(
+ +static void copyIndicesToGroup(
           int         *indIons,
           int          nIons,
           t_swapGroup *g,
@@@ -1380,12 -1382,14 +1380,12 @@@
    * #4 cations        - empty before conversion
    *
    */
- -void convertOldToNewGroupFormat(
+ +static void convertOldToNewGroupFormat(
           t_swapcoords *sc,
           gmx_mtop_t   *mtop,
           gmx_bool      bVerbose,
           t_commrec    *cr)
   {
- -    t_atom                *atom;
- -    gmx_mtop_atomlookup_t  alook = gmx_mtop_atomlookup_init(mtop);
       t_swapGroup           *g     = &sc->grp[3];
   
       /* Loop through the atom indices of group #3 (anions) and put all indices
@@@ -1393,16 -1397,15 +1393,16 @@@
        */
       int  nAnions    = 0;
       int  nCations   = 0;
- -    int *indAnions  = NULL;
- -    int *indCations = NULL;
+ +    int *indAnions  = nullptr;
+ +    int *indCations = nullptr;
       snew(indAnions, g->nat);
       snew(indCations, g->nat);
   
+ +    int molb = 0;
       for (int i = 0; i < g->nat; i++)
       {
- -        gmx_mtop_atomnr_to_atom(alook, g->ind[i], &atom);
- -        if (atom->q < 0)
+ +        const t_atom &atom = mtopGetAtomParameters(mtop, g->ind[i], &molb);
+ +        if (atom.q < 0)
           {
               // This is an anion, add it to the list of anions
               indAnions[nAnions++] = g->ind[i];
@@@ -1437,7 -1440,7 +1437,7 @@@
   /*! \brief Returns TRUE if we started from an old .tpr
    *
    * Then we need to re-sort anions and cations into separate groups */
- -gmx_bool bConvertFromOldTpr(t_swapcoords *sc)
+ +static gmx_bool bConvertFromOldTpr(t_swapcoords *sc)
   {
       // If the last group has no atoms it means we need to convert!
       if ( (sc->ngrp >= 5) && (0 == sc->grp[4].nat) )
@@@ -1450,38 -1453,41 +1450,36 @@@
   
   void init_swapcoords(
           FILE                   *fplog,
- -        gmx_bool                bVerbose,
           t_inputrec             *ir,
           const char             *fn,
           gmx_mtop_t             *mtop,
           rvec                    x[],
           matrix                  box,
- -        swapstate_t            *swapstate,
+ +        ObservablesHistory     *oh,
           t_commrec              *cr,
           const gmx_output_env_t *oenv,
- -        unsigned long           Flags)
+ +        const MdrunOptions     &mdrunOptions)
   {
       t_swapcoords          *sc;
       t_swap                *s;
- -    t_atom                *atom;
       t_swapgrp             *g;
       swapstateIons_t       *gs;
- -    gmx_bool               bAppend, bStartFromCpt, bRerun;
- -    gmx_mtop_atomlookup_t  alook = NULL;
- -
- -    alook = gmx_mtop_atomlookup_init(mtop);
+ +    gmx_bool               bAppend, bStartFromCpt;
-     matrix                 boxCopy;
+ +    swaphistory_t         *swapstate = nullptr;
   
- 
       if ( (PAR(cr)) && !DOMAINDECOMP(cr) )
       {
           gmx_fatal(FARGS, "Position swapping is only implemented for domain decomposition!");
       }
   
- -    bAppend       = Flags & MD_APPENDFILES;
- -    bStartFromCpt = Flags & MD_STARTFROMCPT;
- -    bRerun        = Flags & MD_RERUN;
+ +    bAppend       = mdrunOptions.continuationOptions.appendFiles;
+ +    bStartFromCpt = mdrunOptions.continuationOptions.startedFromCheckpoint;
   
       sc = ir->swap;
       snew(sc->si_priv, 1);
       s = sc->si_priv;
   
- -    if (bRerun)
+ +    if (mdrunOptions.rerun)
       {
           if (PAR(cr))
           {
@@@ -1515,8 -1521,6 +1513,8 @@@
               break;
       }
   
+ +    const gmx_bool bVerbose = mdrunOptions.verbose;
+ +
       // For compatibility with old .tpr files
       if (bConvertFromOldTpr(sc) )
       {
@@@ -1558,13 -1562,7 +1556,13 @@@
   
       if (MASTER(cr))
       {
- -        init_swapstate(swapstate, sc, mtop, x, box, ir->ePBC);
+ +        if (oh->swapHistory == nullptr)
+ +        {
+ +            oh->swapHistory = std::unique_ptr<swaphistory_t>(new swaphistory_t {});
+ +        }
+ +        swapstate = oh->swapHistory.get();
+ +
+ +        init_swapstate(swapstate, sc, mtop, x, box, ir);
       }
   
       /* After init_swapstate we have a set of (old) whole positions for our
@@@ -1585,18 -1583,17 +1583,18 @@@
           real charge;
   
           g      = &(s->group[ig]);
- -        g->apm = get_group_apm_check(ig, s, MASTER(cr) && bVerbose, alook, mtop);
+ +        g->apm = get_group_apm_check(ig, s, MASTER(cr) && bVerbose, mtop);
   
           /* Since all molecules of a group are equal, we only need enough space
            * to determine properties of a single molecule at at time */
           snew(g->m, g->apm);  /* For the center of mass */
           charge = 0;          /* To determine the charge imbalance */
+ +        int molb = 0;
           for (int j = 0; j < g->apm; j++)
           {
- -            gmx_mtop_atomnr_to_atom(alook, g->ind[j], &atom);
- -            g->m[j] = atom->m;
- -            charge += atom->q;
+ +            const t_atom &atom = mtopGetAtomParameters(mtop, g->ind[j], &molb);
+ +            g->m[j] = atom.m;
+ +            charge += atom.q;
           }
           /* Total charge of one molecule of this group: */
           g->q = charge;
@@@ -1611,10 -1608,10 +1609,10 @@@
           {
               /* Save the split group masses if mass-weighting is requested */
               snew(g->m, g->nat);
+ +            int molb = 0;
               for (int i = 0; i < g->nat; i++)
               {
- -                gmx_mtop_atomnr_to_atom(alook, g->ind[i], &atom);
- -                g->m[i] = atom->m;
+ +                g->m[i] = mtopGetAtomMass(mtop, g->ind[i], &molb);
               }
           }
       }
@@@ -1622,14 -1619,6 +1620,6 @@@
       /* Make a t_pbc struct on all nodes so that the molecules
        * chosen for an exchange can be made whole. */
       snew(s->pbc, 1);
-     /* Every node needs to call set_pbc() and therefore every node needs
-      * to know the box dimensions */
-     copy_mat(box, boxCopy);
-     if (PAR(cr))
-     {
-         gmx_bcast(sizeof(boxCopy), boxCopy, cr);
-     }
-     set_pbc(s->pbc, -1, boxCopy);
   
       if (MASTER(cr))
       {
@@@ -1697,7 -1686,7 +1687,7 @@@
                       sc->cyl1r, sc->cyl1u, sc->cyl1l);
   
               fprintf(s->fpout, "#\n");
- -            if (!bRerun)
+ +            if (!mdrunOptions.rerun)
               {
                   fprintf(s->fpout, "# Coupling constant (number of swap attempt steps to average over): %d  (translates to %f ps).\n",
                           sc->nAverage, sc->nAverage*sc->nstswap*ir->delta_t);
@@@ -1709,7 -1698,7 +1699,7 @@@
       }
       else
       {
- -        s->fpout = NULL;
+ +        s->fpout = nullptr;
       }
   
       /* Prepare for parallel or serial run */
@@@ -1720,7 -1709,7 +1710,7 @@@
               g             = &(s->group[ig]);
               g->nat_loc    = 0;
               g->nalloc_loc = 0;
- -            g->ind_loc    = NULL;
+ +            g->ind_loc    = nullptr;
           }
       }
       else
@@@ -1758,7 -1747,7 +1748,7 @@@
           else
           {
               fprintf(stderr, "%s Determining initial numbers of ions per compartment.\n", SwS);
- -            get_initial_ioncounts(ir, x, box, cr, bRerun);
+ +            get_initial_ioncounts(ir, x, box, cr, mdrunOptions.rerun);
           }
   
           /* Prepare (further) checkpoint writes ... */
@@@ -1818,7 -1807,7 +1808,7 @@@
       }
   
       /* Initialize arrays that keep track of through which channel the ions go */
- -    detect_flux_per_channel_init(cr, s, swapstate, bStartFromCpt);
+ +    detect_flux_per_channel_init(s, swapstate, bStartFromCpt);
   
       /* We need to print the legend if we open this file for the first time. */
       if (MASTER(cr) && !bAppend)
@@@ -1989,9 -1978,10 +1979,9 @@@ gmx_bool do_swapcoords
           gmx_int64_t       step,
           double            t,
           t_inputrec       *ir,
- -        gmx_wallcycle_t   wcycle,
+ +        gmx_wallcycle    *wcycle,
           rvec              x[],
           matrix            box,
- -        gmx_mtop_t       *mtop,
           gmx_bool          bVerbose,
           gmx_bool          bRerun)
   {
@@@ -2003,6 -1993,7 +1993,6 @@@
       t_swapgrp            *g, *gsol;
       int                   isol, iion;
       rvec                  com_solvent, com_particle; /* solvent and swap molecule's center of mass */
- -    gmx_mtop_atomlookup_t alook = NULL;
   
   
       wallcycle_start(wcycle, ewcSWAP);
@@@ -2010,6 -2001,7 +2000,7 @@@
       sc  = ir->swap;
       s   = sc->si_priv;
   
+     set_pbc(s->pbc, ir->ePBC, box);
   
       /* Assemble the positions of the split groups, i.e. the channels.
        * Here we also pass a shifts array to communicate_group_positions(), so that it can make
@@@ -2030,8 -2022,8 +2021,8 @@@
       for (ig = eSwapFixedGrpNR; ig < s->ngrp; ig++)
       {
           g = &(s->group[ig]);
- -        communicate_group_positions(cr, g->xc, NULL, NULL, FALSE,
- -                                    x, g->nat, g->nat_loc, g->ind_loc, g->c_ind_loc, NULL, NULL);
+ +        communicate_group_positions(cr, g->xc, nullptr, nullptr, FALSE,
+ +                                    x, g->nat, g->nat_loc, g->ind_loc, g->c_ind_loc, nullptr, nullptr);
   
           /* Determine how many ions of this type each compartment contains */
           sortMoleculesIntoCompartments(g, cr, sc, box, step, s->fpout, bRerun, FALSE);
@@@ -2057,8 -2049,8 +2048,8 @@@
           /* Since we here know that we have to perform ion/water position exchanges,
            * we now assemble the solvent positions */
           g = &(s->group[eGrpSolvent]);
- -        communicate_group_positions(cr, g->xc, NULL, NULL, FALSE,
- -                                    x, g->nat, g->nat_loc, g->ind_loc, g->c_ind_loc, NULL, NULL);
+ +        communicate_group_positions(cr, g->xc, nullptr, nullptr, FALSE,
+ +                                    x, g->nat, g->nat_loc, g->ind_loc, g->c_ind_loc, nullptr, nullptr);
   
           /* Determine how many molecules of solvent each compartment contains */
           sortMoleculesIntoCompartments(g, cr, sc, box, step, s->fpout, bRerun, TRUE);
@@@ -2082,6 -2074,7 +2073,6 @@@
           }
   
           /* Now actually perform the particle exchanges, one swap group after another */
- -        alook  = gmx_mtop_atomlookup_init(mtop);
           gsol   = &s->group[eGrpSolvent];
           for (ig = eSwapFixedGrpNR; ig < s->ngrp; ig++)
           {
@@@ -2143,8 -2136,9 +2134,8 @@@
                           SwS, nswaps, nswaps > 1 ? "s" : "", step, g->molname);
               }
           }
- -        gmx_mtop_atomlookup_destroy(alook);
   
- -        if (s->fpout != NULL)
+ +        if (s->fpout != nullptr)
           {
               print_ionlist(s, t, "  # after swap");
           }
diff --combined src/gromacs/tables/forcetable.cpp

index cf49c48f2f2092bcde31dea283f29b05cc5201c6,04eae447f76929965d45e3896b08f082487577f1..6ef0b7d1f72315c2c70ca6ba7451045add273b87
--- 1/src/gromacs/tables/forcetable.cpp
--- 2/src/gromacs/tables/forcetable.cpp
+++ b/src/gromacs/tables/forcetable.cpp
@@@ -211,7 -211,7 +211,7 @@@ void table_spline3_fill_ewald_lr(rea
               vi = v_inrange - dc*(i - i_inrange)*dx;
           }
   
- -        if (table_v != NULL)
+ +        if (table_v != nullptr)
           {
               table_v[i] = vi;
           }
@@@ -281,7 -281,7 +281,7 @@@
       /* Currently the last value only contains half the force: double it */
       table_f[0] *= 2;
   
- -    if (table_v != NULL && table_fdv0 != NULL)
+ +    if (table_v != nullptr && table_fdv0 != nullptr)
       {
           /* Copy to FDV0 table too. Allocation occurs in forcerec.c,
            * init_ewald_f_table().
@@@ -317,7 -317,7 +317,7 @@@ static double spline3_table_scale(doubl
   
       /* Don't try to be more accurate on energy than the precision */
       func_tol  = std::max(func_tol, static_cast<double>(GMX_REAL_EPS));
-     sc_func   = std::cbrt(third_deriv_max/(6*12*sqrt(3)*func_tol))*x_scale;
+     sc_func   = std::cbrt(third_deriv_max/(6*12*std::sqrt(3.0)*func_tol))*x_scale;
   
       return std::max(sc_deriv, sc_func);
   }
@@@ -382,6 -382,37 +382,6 @@@ real ewald_spline3_table_scale(const in
       return sc;
   }
   
- -/* Calculate the potential and force for an r value
- - * in exactly the same way it is done in the inner loop.
- - * VFtab is a pointer to the table data, offset is
- - * the point where we should begin and stride is
- - * 4 if we have a buckingham table, 3 otherwise.
- - * If you want to evaluate table no N, set offset to 4*N.
- - *
- - * We use normal precision here, since that is what we
- - * will use in the inner loops.
- - */
- -static void evaluate_table(real VFtab[], int offset, int stride,
- -                           real tabscale, real r, real *y, real *yp)
- -{
- -    int  n;
- -    real rt, eps, eps2;
- -    real Y, F, Geps, Heps2, Fp;
- -
- -    rt       =  r*tabscale;
- -    n        =  (int)rt;
- -    eps      =  rt - n;
- -    eps2     =  eps*eps;
- -    n        =  offset+stride*n;
- -    Y        =  VFtab[n];
- -    F        =  VFtab[n+1];
- -    Geps     =  eps*VFtab[n+2];
- -    Heps2    =  eps2*VFtab[n+3];
- -    Fp       =  F+Geps+Heps2;
- -    *y       =  Y+eps*Fp;
- -    *yp      =  (Fp+Geps+2.0*Heps2)*tabscale;
- -}
- -
   static void copy2table(int n, int offset, int stride,
                          double x[], double Vtab[], double Ftab[], real scalefactor,
                          real dest[])
@@@ -578,7 -609,7 +578,7 @@@ static void read_tables(FILE *fp, cons
   {
       char    *libfn;
       char     buf[STRLEN];
- -    double **yy = NULL, start, end, dx0, dx1, ssd, vm, vp, f, numf;
+ +    double **yy = nullptr, start, end, dx0, dx1, ssd, vm, vp, f, numf;
       int      k, i, nx, nx0 = 0, ny, nny, ns;
       gmx_bool bAllZero, bZeroV, bZeroF;
       double   tabscale;
@@@ -1542,9 -1573,9 +1542,9 @@@ t_forcetable *makeDispersionCorrectionT
                                               t_forcerec *fr, real rtab,
                                               const char *tabfn)
   {
- -    t_forcetable *dispersionCorrectionTable = NULL;
+ +    t_forcetable *dispersionCorrectionTable = nullptr;
   
- -    if (tabfn == NULL)
+ +    if (tabfn == nullptr)
       {
           if (debug)
           {
diff --combined src/gromacs/utility/sysinfo.cpp

index 88a26d8696dc0c3a5ffc5b83486ccfd4178071cf,8bcf8b5de0e215b083a7ef6a239770963d5583eb..91fc69cc7764fd55dedb68852090193eb185ef62
--- 1/src/gromacs/utility/sysinfo.cpp
--- 2/src/gromacs/utility/sysinfo.cpp
+++ b/src/gromacs/utility/sysinfo.cpp
@@@ -144,7 -144,7 +144,7 @@@ gmx_ctime_r(const time_t *clock, char *
       buf[len-1] = '\0';
   #elif (defined(__sun))
       /*Solaris*/
-     ctime_r(clock, buf, len);
+     ctime_r(clock, buf);
   #else
       char tmpbuf[30];
       ctime_r(clock, tmpbuf);
@@@ -156,7 -156,7 +156,7 @@@
   
   void gmx_format_current_time(char *buf, size_t len)
   {
- -    time_t clock = time(NULL);
+ +    time_t clock = time(nullptr);
       gmx_ctime_r(&clock, buf, len);
   }
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 11 Sep 2017 17:33:09 +0000 (19:33 +0200)
committer	Berk Hess <hess@kth.se>
	Tue, 12 Sep 2017 06:47:39 +0000 (08:47 +0200)
		1	2
CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxManageFFTLibraries.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
docs/install-guide/index.rst	patch \|	diff1 \|	diff2 \|	blob \| history
docs/user-guide/environment-variables.rst	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/fileio/oenv.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/fileio/oenv.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/fileio/trxio.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/nonbonded/CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/hardware/cpuinfo.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/hardware/cpuinfo.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/md_support.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/simd/simd_math.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/swap/swapcoords.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/tables/forcetable.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/utility/sysinfo.cpp	patch \|	diff1 \|	diff2 \|	blob \| history