set(CUDA_HOST_COMPILER_OPTIONS "")
if(APPLE AND CMAKE_C_COMPILER_ID MATCHES "GNU")
- # Some versions of gcc-4.8 and gcc-4.9 produce errors (in particular on OS X)
- # if we do not use -D__STRICT_ANSI__. It is harmless, so we might as well add it for all versions.
+ # Some versions of gcc-4.8 and gcc-4.9 have produced errors
+ # (in particular on OS X) if we do not use
+ # -D__STRICT_ANSI__. It is harmless, so we might as well add
+ # it for all versions.
list(APPEND CUDA_HOST_COMPILER_OPTIONS "-D__STRICT_ANSI__")
endif()
endforeach()
else()
# Set the CUDA GPU architectures to compile for:
- # - with CUDA >=5.0 <6.5: CC <=3.5 is supported
- # => compile sm_30, sm_35 SASS, and compute_35 PTX
- # - with CUDA ==6.5: CC <=3.7 and 5.0 are supported
- # => compile sm_30, sm_35, sm_37 sm_50, SASS, and compute_50 PTX
- # - with CUDA >=7.0 CC 5.2 is supported (5.3, Tegra X1 we don't generate code for)
- # => compile sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
- # - with CUDA >=8.0 CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code or it)
- # => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
# - with CUDA >=9.0 CC 7.0 is supported and CC 2.0 is no longer supported
# => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70 SASS, and compute_70 PTX
- #
- # Note that CUDA 6.5.19 second patch release supports cc 5.2 too, but
- # CUDA_VERSION does not contain patch version and having PTX 5.0 JIT-ed is
- # equally fast as compiling with sm_5.2 anyway.
+ # - with CUDA >=10.0 CC 7.5 is supported
+ # => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70, sm_75 SASS, and compute_75 PTX
# First add flags that trigger SASS (binary) code generation for physical arch
list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_30,code=sm_30")
list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=sm_35")
-
- if(NOT CUDA_VERSION VERSION_LESS "6.5") # >= 6.5
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_37,code=sm_37")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=sm_50")
- endif()
- if(NOT CUDA_VERSION VERSION_LESS "7.0") # >= 7.0
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=sm_52")
- endif()
- if(NOT CUDA_VERSION VERSION_LESS "8.0") # >= 8.0
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
- endif()
- if(NOT CUDA_VERSION VERSION_LESS "9.0") # >= 9.0
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=sm_70")
- endif()
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_37,code=sm_37")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=sm_50")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=sm_52")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=sm_70")
# Next add flags that trigger PTX code generation for the newest supported virtual arch
# that's useful to JIT to future architectures
- if(CUDA_VERSION VERSION_LESS "6.5")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=compute_35")
- elseif(CUDA_VERSION VERSION_LESS "7.0")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
- elseif(CUDA_VERSION VERSION_LESS "8.0")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
- elseif(CUDA_VERSION VERSION_LESS "9.0")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
- elseif(CUDA_VERSION VERSION_LESS "10.0")
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=compute_70")
- else() # version >= 10.0
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=compute_35")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
+ list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=compute_70")
+ if(NOT CUDA_VERSION VERSION_LESS "10.0")
list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_75,code=compute_75")
endif()
endif()
set_property(CACHE GMX_CUDA_TARGET_COMPUTE PROPERTY TYPE STRING)
endif()
+# FindCUDA.cmake is unaware of the mechanism used by cmake to embed
+# the compiler flag for the required C++ standard in the generated
+# build files, so we have to pass it ourselves
+if (MSVC)
+ # We use C++14 on MSVC, but cmake does not understand the
+ # necessary compilation option for that until version 3.10, so we
+ # can remove this after we require that version.
+ if (NOT CMAKE_CXX14_STANDARD_COMPILE_OPTION)
+ set(GMX_CXX_STANDARD_COMPILE_OPTION "-std:c++14")
+ else()
+ set(GMX_CXX_STANDARD_COMPILE_OPTION "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
+ endif()
+else()
+ set(GMX_CXX_STANDARD_COMPILE_OPTION "${CMAKE_CXX14_STANDARD_COMPILE_OPTION}")
+endif()
+list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CXX_STANDARD_COMPILE_OPTION}")
+
# assemble the CUDA flags
list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CUDA_NVCC_GENCODE_FLAGS}")
list(APPEND GMX_CUDA_NVCC_FLAGS "-use_fast_math")
string(TOUPPER "${CMAKE_BUILD_TYPE}" _build_type)
gmx_check_if_changed(_cuda_nvcc_executable_or_flags_changed CUDA_NVCC_EXECUTABLE CUDA_NVCC_FLAGS CUDA_NVCC_FLAGS_${_build_type})
- if(_cuda_nvcc_executable_or_flags_changed OR CUDA_HOST_COMPILER_CHANGED OR NOT GMX_NVCC_WORKS)
+ # We would like to be helpful and reject the host compiler with a
+ # clear error message at configure time, rather than let nvcc
+ # later reject the host compiler as not supported when the first
+ # CUDA source file is built. We've implemented that for current
+ # nvcc running on Unix-like systems, but e.g. changes to nvcc
+ # will further affect the limited portability of this checking
+ # code. Set the CMake variable GMX_NVCC_WORKS on if you want to
+ # bypass this check.
+ if((_cuda_nvcc_executable_or_flags_changed OR CUDA_HOST_COMPILER_CHANGED OR NOT GMX_NVCC_WORKS) AND NOT WIN32)
message(STATUS "Check for working NVCC/C compiler combination")
execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -ccbin ${CUDA_HOST_COMPILER} -c ${CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS_${_build_type}} ${CMAKE_SOURCE_DIR}/cmake/TestCUDA.cu
RESULT_VARIABLE _cuda_test_res
# VSX uses the same function API as Altivec/VMX, so make sure we tune for the current CPU and not VMX.
# By putting these flags here rather than in the general compiler flags file we can safely assume
# that we are at least on Power7 since that is when VSX appeared.
+
+ # NOTE: Enabling instruction fusion on Power8/9 using -mpower8-fusion/-mpower9-fusion
+ # seems to produce buggy code (see #2747, #2746, #2734).
+ # Note that instruction fusion does have considerable performance benefits
+    # (up to 8% measured with gcc 8); if the issue is resolved, the flag can be re-enabled.
gmx_run_cpu_detection(brand)
if(CPU_DETECTION_BRAND MATCHES "POWER7")
gmx_test_cflag(GNU_C_VSX_POWER7 "-mcpu=power7 -mtune=power7" ${TOOLCHAIN_C_FLAGS_VARIABLE})
gmx_test_cflag(GNU_CXX_VSX_POWER7 "-mcpu=power7 -mtune=power7" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
elseif(CPU_DETECTION_BRAND MATCHES "POWER8")
# Enable power8 vector extensions on such platforms.
- gmx_test_cflag(GNU_C_VSX_POWER8 "-mcpu=power8 -mpower8-vector -mpower8-fusion" ${TOOLCHAIN_C_FLAGS_VARIABLE})
- gmx_test_cflag(GNU_CXX_VSX_POWER8 "-mcpu=power8 -mpower8-vector -mpower8-fusion" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
+ gmx_test_cflag(GNU_C_VSX_POWER8 "-mcpu=power8 -mpower8-vector" ${TOOLCHAIN_C_FLAGS_VARIABLE})
+ gmx_test_cflag(GNU_CXX_VSX_POWER8 "-mcpu=power8 -mpower8-vector" ${TOOLCHAIN_CXX_FLAGS_VARIABLE})
elseif(CPU_DETECTION_BRAND MATCHES "POWER9")
# Enable power9 vector extensions on such platforms.
# TODO consider whether adding " -mpower9-vector -mpower9-fusion"
else()
# Don't add arch-specific flags for unknown architectures.
endif()
- # Altivec was originally single-only, and it took a while for compilers
- # to support the double-precision features in VSX.
- if(GMX_DOUBLE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
- message(FATAL_ERROR "Using VSX SIMD in double precision with GCC requires GCC-4.9 or later.")
- endif()
endif()
if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL" OR ${CMAKE_C_COMPILER_ID} MATCHES "XL")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "13.1.5" OR CMAKE_C_COMPILER_VERSION VERSION_LESS "13.1.5")
function(gmx_find_simd_avx_128_fma_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIABLE CXX_FLAGS_VARIABLE)
find_x86_toolchain_flags(TOOLCHAIN_C_FLAGS TOOLCHAIN_CXX_FLAGS)
- # We don't have the full compiler version string yet (BUILD_C_COMPILER),
- # so we can't distinguish vanilla from Apple clang versions, but catering for a few rare AMD
- # hackintoshes is not worth the effort.
- if (APPLE AND (CMAKE_C_COMPILER_ID STREQUAL "Clang" OR
- CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
- message(WARNING "Due to a known compiler bug, Clang up to version 3.2 (and Apple Clang up to version 4.1) produces incorrect code with AVX_128_FMA SIMD.")
- endif()
-
- # clang <=3.2 contains a bug that causes incorrect code to be generated for the
- # vfmaddps instruction and therefore the bug is triggered with AVX_128_FMA.
- # (see: http://llvm.org/bugs/show_bug.cgi?id=15040).
- # We can work around this by not using the integrated assembler (except on OS X
- # which has an outdated assembler that does not support AVX instructions).
- if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.3")
- # we assume that we have an external assembler that supports AVX
- message(STATUS "Clang ${CMAKE_C_COMPILER_VERSION} detected, enabling FMA bug workaround")
- set(TOOLCHAIN_C_FLAGS "${TOOLCHAIN_C_FLAGS} -no-integrated-as")
- endif()
- if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "3.3")
- # we assume that we have an external assembler that supports AVX
- message(STATUS "Clang ${CMAKE_CXX_COMPILER_VERSION} detected, enabling FMA bug workaround")
- set(TOOLCHAIN_CXX_FLAGS "${TOOLCHAIN_CXX_FLAGS} -no-integrated-as")
- endif()
-
# AVX128/FMA on AMD is a bit complicated. We need to do detection in three stages:
# 1) Find the flags required for generic AVX support
# 2) Find the flags necessary to enable fused-multiply add support
# GROMACS 2016 2
# GROMACS 2018 3
# GROMACS 2019 4
+# GROMACS 2020 5
# LIBRARY_SOVERSION_MINOR so minor version for the built libraries.
# Should be increased for each release that changes only the implementation.
# In GROMACS, the typical policy is to increase it for each patch version
# This ID is used for the source code tarball.
# GMX_MANUAL_DOI_ID
# Same as above, but for the reference manual.
- # Setting and retrieving of those variables is handled in gmxCheckReleaseDOI.cmake
# They are collected into a single section below.
# The following variables are set based on these:
# GMX_VERSION String composed from GMX_VERSION_* numeric variables
# The GROMACS convention is that these are the version number of the next
# release that is going to be made from this branch.
-set(GMX_VERSION_MAJOR 2019)
-set(GMX_VERSION_PATCH 2)
+set(GMX_VERSION_MAJOR 2020)
+set(GMX_VERSION_PATCH 0)
# The suffix, on the other hand, is used mainly for betas and release
# candidates, where it signifies the most recent such release from
# this branch; it will be empty before the first such release, as well
# here. The important thing is to minimize the chance of third-party
# code being able to dynamically link with a version of libgromacs
# that might not work.
-set(LIBRARY_SOVERSION_MAJOR 4)
+set(LIBRARY_SOVERSION_MAJOR 5)
set(LIBRARY_SOVERSION_MINOR 0)
set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
endif()
set(REGRESSIONTEST_VERSION "${GMX_VERSION_STRING}")
-set(REGRESSIONTEST_BRANCH "refs/heads/release-2019")
+set(REGRESSIONTEST_BRANCH "refs/heads/master")
# Run the regressiontests packaging job with the correct package
# version string, and the release box checked, in order to have it
# build the regressiontests tarball with all the right naming. The
# naming affects the md5sum that has to go here, and if it isn't right
# release workflow will report a failure.
-set(REGRESSIONTEST_MD5SUM "36103a0078bf9a802e16ec982b4e5c4c" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
+set(REGRESSIONTEST_MD5SUM "3d06d41e07f523d70ae575b9ad75c670" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
math(EXPR GMX_VERSION_NUMERIC
"${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
reference-manual/analysis/rmsd.rst
reference-manual/analysis/covariance-analysis.rst
reference-manual/analysis/dihedral-pca.rst
+ reference-manual/analysis/hydrogen-bonds.rst
reference-manual/analysis/protein-related.rst
reference-manual/analysis/interface-related.rst)
# The image files have also been ordered by the respective
how-to/visualize.rst
install-guide/index.rst
release-notes/index.rst
+ release-notes/highlights.rst
+ release-notes/features.rst
+ release-notes/performance.rst
+ release-notes/tools.rst
+ release-notes/bugs-fixed.rst
+ release-notes/removed-functionality.rst
+ release-notes/deprecated-functionality.rst
+ release-notes/portability.rst
+ release-notes/miscellaneous.rst
+ release-notes/2019/2019.2.rst
release-notes/2019/2019.1.rst
release-notes/2019/major/highlights.rst
release-notes/2019/major/features.rst
add_custom_target(install-guide
COMMAND
${SPHINX_EXECUTABLE}
- -q -E -b text
+ -q -b text
-w sphinx-install.log
-d ${CMAKE_CURRENT_BINARY_DIR}/install-guide/_doctrees
-c ${SPHINX_INPUT_DIR}
add_custom_target(webpage-sphinx
DEPENDS sphinx-programs
DEPENDS sphinx-input
- DEPENDS sphinx-image-conversion
+ DEPENDS sphinx-image-conversion
+ DEPENDS manual
COMMAND
${CMAKE_COMMAND} -E make_directory ${SPHINX_INPUT_DIR}/_static
COMMAND
${SPHINX_EXECUTABLE}
- -q -E -b html
+ -q -b html
-w sphinx-html.log
-d "${SPHINX_CACHE_DIR}"
"${SPHINX_INPUT_DIR}"
add_custom_target(man
COMMAND
${SPHINX_EXECUTABLE}
- -q -E -b man
+ -q -b man
-w sphinx-man.log
-d ${SPHINX_CACHE_DIR}
-t do_man
On a cluster where users are expected to be running across multiple
nodes using MPI, make one installation similar to the above, and
-another using an MPI wrapper compiler and which is `building only
+another using ``-DGMX_MPI=on`` and which is `building only
mdrun`_, because that is the only component of |Gromacs| that uses
MPI. The latter will install a single simulation engine binary,
i.e. ``mdrun_mpi`` when the default suffix is used. Hence it is safe
Compiler
^^^^^^^^
-|Gromacs| can be compiled on any platform with ANSI C99 and C++11
+|Gromacs| can be compiled on any platform with ANSI C99 and C++14
compilers, and their respective standard C/C++ libraries. Good
performance on an OS and architecture requires choosing a good
compiler. We recommend gcc, because it is free, widely available and
frequently provides the best performance.
You should strive to use the most recent version of your
-compiler. Since we require full C++11 support the minimum supported
+compiler. Since we require full C++14 support the minimum supported
compiler versions are
-* GNU (gcc) 4.8.1
+* GNU (gcc) 5.1
* Intel (icc) 17.0.1
-* LLVM (clang) 3.3
-* Microsoft (MSVC) 2017 (C++14 is used)
+* LLVM (clang) 3.6
+* Microsoft (MSVC) 2017
Other compilers may work (Cray, Pathscale, older clang) but do
not offer competitive performance. We recommend against PGI because
components beside the compiler itself (e.g. assembler or linker);
these are often shipped by your OS distribution's binutils package.
-C++11 support requires adequate support in both the compiler and the
+C++14 support requires adequate support in both the compiler and the
C++ library. The gcc and MSVC compilers include their own standard
-libraries and require no further configuration. For configuration of
-other compilers, read on.
+libraries and require no further configuration. If your vendor's
+compiler also manages the standard library via compiler flags,
+these will be honored. For configuration of other compilers, read on.
On Linux, both the Intel and clang compiler use the libstdc++ which
comes with gcc as the default C++ library. For |Gromacs|, we require
-the compiler to support libstc++ version 4.8.1 or higher. To select a
-particular libstdc++ library, use:
-
-* For Intel: ``-DGMX_STDLIB_CXX_FLAGS=-gcc-name=/path/to/gcc/binary``
- or make sure that the correct gcc version is first in path (e.g. by
- loading the gcc module). It can also be useful to add
- ``-DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,/path/to/gcc/lib64
- -L/path/to/gcc/lib64"`` to ensure linking works correctly.
-* For clang:
- ``-DCMAKE_CXX_FLAGS=--gcc-toolchain=/path/to/gcc/folder``. This
- folder should contain ``include/c++``.
+the compiler to support libstdc++ version 5.1 or higher. To select a
+particular libstdc++ library, provide the path to g++ with
+``-DGMX_GPLUSPLUS_PATH=/path/to/g++``.
On Windows with the Intel compiler, the MSVC standard library is used,
and at least MSVC 2017 is required. Load the environment variables with
vcvarsall.bat.
-To build with any compiler and clang's libcxx standard library, use
-``-DGMX_STDLIB_CXX_FLAGS=-stdlib=libc++
--DGMX_STDLIB_LIBRARIES='-lc++abi -lc++'``.
+To build with clang and llvm's libcxx standard library, use
+``-DCMAKE_CXX_FLAGS=-stdlib=libc++``.
If you are running on Mac OS X, the best option is the Intel
compiler. Both clang and gcc will work, but they produce lower
|Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
On Linux, NVIDIA CUDA_ toolkit with minimum version |REQUIRED_CUDA_VERSION|
-is required, and the latest version is strongly encouraged. Using
-Microsoft MSVC compiler requires version 9.0. NVIDIA GPUs with at
+is required, and the latest version is strongly encouraged. NVIDIA GPUs with at
least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
required. You are strongly recommended to
get the latest CUDA version and driver that supports your hardware, but
To make it possible to use other accelerators, |Gromacs| also includes
OpenCL_ support. The minimum OpenCL version required is
- |REQUIRED_OPENCL_MIN_VERSION|. The current OpenCL implementation is recommended for
+ |REQUIRED_OPENCL_MIN_VERSION| and only 64-bit implementations are supported.
+ The current OpenCL implementation is recommended for
use with GCN-based AMD GPUs, and on Linux we recommend the ROCm runtime.
Intel integrated GPUs are supported with the Neo drivers.
OpenCL is also supported with NVIDIA GPUs, but using
to the NVIDIA OpenCL runtime).
It is not possible to configure both CUDA and OpenCL
support in the same build of |Gromacs|, nor to support both
- Intel and other vendors' GPUs with OpenCL.
+ Intel and other vendors' GPUs with OpenCL. A 64-bit implementation
+ of OpenCL is required and therefore OpenCL is only supported on 64-bit platforms.
.. _mpi-support:
standard, and
* wrapper compilers that will compile code using that library.
+To compile with MPI set your compiler to the normal (non-MPI) compiler
+and add ``-DGMX_MPI=on`` to the cmake options. It is possible to set
+the compiler to the MPI compiler wrapper but it is neither necessary
+nor recommended.
+
The |Gromacs| team recommends OpenMPI_ version
1.6 (or higher), MPICH_ version 1.4.1 (or
higher), or your hardware vendor's MPI installation. The most recent
CPUs also works well.
Experimental support is available for compiling CUDA code, both for host and
-device, using clang (version 3.9 or later).
-A CUDA toolkit (>= v7.0) is still required but it is used only for GPU device code
+device, using clang (version 6.0 or later).
+A CUDA toolkit is still required but it is used only for GPU device code
generation and to link against the CUDA runtime library.
The clang CUDA support simplifies compilation and provides benefits for development
(e.g. allows the use of code sanitizers in CUDA host-code).
Note that this is mainly a developer-oriented feature and it is not recommended
for production use as the performance can be significantly lower than that
of code compiled with nvcc (and it has also received less testing).
-However, note that with clang 5.0 the performance gap is significantly narrowed
+However, note that since clang 5.0 the performance gap is only moderate
(at the time of writing, about 20% slower GPU kernels), so this version
could be considered in non performance-critical use-cases.
it works because we have tested it. We do test on Linux, Windows, and
Mac with a range of compilers and libraries for a range of our
configuration options. Every commit in our git source code repository
-is currently tested on x86 with a number of gcc versions ranging from 4.8.1
-through 7, versions 16 and 18 of the Intel compiler, and Clang
-versions 3.4 through 5. For this, we use a variety of GNU/Linux
+is currently tested on x86 with a number of gcc versions ranging from 5.1
+through 8.1, version 19 of the Intel compiler, and Clang
+versions 3.6 through 7. For this, we use a variety of GNU/Linux
flavors and versions as well as recent versions of Windows. Under
Windows, we test both MSVC 2017 and version 16 of the Intel compiler.
+Other compiler, library, and OS versions are tested less frequently.
For details, you can
have a look at the `continuous integration server used by GROMACS`_,
which runs Jenkins_.
functionality supported, whereas patch releases contain only fixes for
issues identified in the corresponding major releases.
-Two versions of |Gromacs| are under active maintenance, the 2019
-series and the 2018 series. In the latter, only highly conservative
+Two versions of |Gromacs| are under active maintenance, the NEXT
+series and the 2019 series. In the latter, only highly conservative
fixes will be made, and only to address issues that affect scientific
correctness. Naturally, some of those releases will be made after the
-year 2018 ends, but we keep 2018 in the name so users understand how
+year 2019 ends, but we keep 2019 in the name so users understand how
up to date their version is. Such fixes will also be incorporated into
-the 2019 release series, as appropriate. Around the time the 2020
-release is made, the 2018 series will no longer be maintained.
+the NEXT release series, as appropriate. Around the time the NEXT+1
+release is made, the 2019 series will no longer be maintained.
Where issue numbers are reported in these release notes, more details
can be found at https://redmine.gromacs.org at that issue number.
+|Gromacs| NEXT series
+---------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ highlights
+ features
+ performance
+ tools
+ bugs-fixed
+ deprecated-functionality
+ removed-functionality
+ portability
+ miscellaneous
+
+
|Gromacs| 2019 series
---------------------
.. toctree::
:maxdepth: 1
+ 2019/2019.2
2019/2019.1
Major release
integrators, the leap-frog :mdp-value:`integrator=md` integrator
only supports 1. Data for the NH chain variables is not printed
to the :ref:`edr` file by default, but can be turned on with the
- :mdp:`print-nose-hoover-chains` option.
+ :mdp:`print-nose-hoover-chain-variables` option.
.. mdp:: print-nose-hoover-chain-variables
Electric fields
^^^^^^^^^^^^^^^
- .. mdp:: electric-field-x ; electric-field-y ; electric-field-z
+ .. mdp:: electric-field-x
+ .. mdp:: electric-field-y
+ .. mdp:: electric-field-z
Here you can specify an electric field that optionally can be
alternating and pulsed. The general expression for the field
has the form of a gaussian laser pulse:
- E(t) = E0 exp ( -(t-t0)\ :sup:`2`/(2 sigma\ :sup:`2`) ) cos(omega (t-t0))
+ .. math:: E(t) = E_0 \exp\left[-\frac{(t-t_0)^2}{2\sigma^2}\right]\cos\left[\omega (t-t_0)\right]
For example, the four parameters for direction x are set in the
- three fields of ``electric-field-x`` (and similar for y and z)
- like
+ fields of :mdp:`electric-field-x` (and similar for ``electric-field-y``
+ and ``electric-field-z``) like
- electric-field-x = E0 omega t0 sigma
+ ``electric-field-x = E0 omega t0 sigma``
- In the special case that sigma = 0, the exponential term is omitted
- and only the cosine term is used. If also omega = 0 a static
- electric field is applied.
+ with units (respectively) V nm\ :sup:`-1`, ps\ :sup:`-1`, ps, ps.
- More details in Carl Caleman and David van der Spoel: Picosecond
- Melting of Ice by an Infrared Laser Pulse - A Simulation Study.
- Angew. Chem. Intl. Ed. 47 pp. 14 17-1420 (2008)
+ In the special case that ``sigma = 0``, the exponential term is omitted
+ and only the cosine term is used. If also ``omega = 0`` a static
+ electric field is applied.
+ Read more at :ref:`electric fields` and in ref. \ :ref:`146 <refCaleman2008a>`.
Mixed quantum/classical molecular dynamics
add_subdirectory(gmxlib)
add_subdirectory(mdlib)
-add_subdirectory(applied-forces)
-add_subdirectory(listed-forces)
+add_subdirectory(applied_forces)
+add_subdirectory(listed_forces)
+add_subdirectory(nbnxm)
add_subdirectory(commandline)
add_subdirectory(domdec)
add_subdirectory(ewald)
add_subdirectory(math)
add_subdirectory(mdrun)
add_subdirectory(mdrunutility)
+add_subdirectory(mdspan)
add_subdirectory(mdtypes)
add_subdirectory(onlinehelp)
add_subdirectory(options)
# source files, and issue warnings about that. Remove the use of
# -Wno-missing-prototypes here and above when the group scheme is
# removed.
- if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ if (HAS_NO_MISSING_PROTO AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
target_compile_options(libgromacs_generated PRIVATE "-Wno-missing-prototypes")
endif()
if (HAS_NO_MSVC_UNUSED)
${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
${OpenCL_LIBRARIES}
- ${GMX_STDLIB_LIBRARIES}
PUBLIC
${GMX_PUBLIC_LIBRARIES}
)
+if (GMX_OPENMP)
+ target_link_libraries(libgromacs PUBLIC OpenMP::OpenMP_CXX)
+endif()
set_target_properties(libgromacs PROPERTIES
OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
SOVERSION ${LIBRARY_SOVERSION_MAJOR}
VERSION ${LIBRARY_VERSION}
- COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+ )
gmx_manage_lmfit()
target_link_libraries(libgromacs PRIVATE lmfit)
DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
COMPONENT libraries)
- # Install the NB source and headers
+ # Install the NBNXM source and headers
file(GLOB OPENCL_INSTALLED_FILES
- mdlib/nbnxn_consts.h
+ nbnxm/constants.h
)
install(FILES ${OPENCL_INSTALLED_FILES}
- DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib
+ DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm
COMPONENT libraries)
file(GLOB OPENCL_INSTALLED_FILES
- mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
- mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
- mdlib/nbnxn_ocl/nbnxn_ocl_consts.h
+ nbnxm/opencl/nbnxm_ocl_kernels.cl
+ nbnxm/opencl/nbnxm_ocl_kernel.clh
+ nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
+ nbnxm/opencl/nbnxm_ocl_kernels.clh
+ nbnxm/opencl/nbnxm_ocl_kernels_fastgen.clh
+ nbnxm/opencl/nbnxm_ocl_kernels_fastgen_add_twincut.clh
+ nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
+ nbnxm/opencl/nbnxm_ocl_consts.h
)
install(FILES ${OPENCL_INSTALLED_FILES}
- DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/mdlib/nbnxn_ocl
+ DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm/opencl
COMPONENT libraries)
# Install the PME source and headers
file(GLOB OPENCL_INSTALLED_FILES
- ewald/pme-spread.clh
- ewald/pme-solve.clh
- ewald/pme-gather.clh
- ewald/pme-gpu-utils.clh
- ewald/pme-program.cl
- ewald/pme-gpu-types.h
+ ewald/pme_spread.clh
+ ewald/pme_solve.clh
+ ewald/pme_gather.clh
+ ewald/pme_gpu_utils.clh
+ ewald/pme_program.cl
+ ewald/pme_gpu_types.h
)
install(FILES ${OPENCL_INSTALLED_FILES}
DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdsetup.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nsgrid.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
/* Copy back the global charge group indices from state
* and rebuild the local charge group to atom index.
*/
- for (gmx::index i = 0; i < atomGroupsState.size(); i++)
+ for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
{
const int atomGroupGlobal = atomGroupsState[i];
const int groupSize = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
cg0 = cg1 - cd->ind[p-1].nrecv[zone];
}
- const int numThreads = static_cast<int>(comm->dth.size());
+ const int numThreads = gmx::ssize(comm->dth);
#pragma omp parallel for num_threads(numThreads) schedule(static)
for (int th = 0; th < numThreads; th++)
{
gmx::ArrayRef<T> vectorToSort,
std::vector<T> *workVector)
{
- if (gmx::index(workVector->size()) < sort.size())
+ if (gmx::index(workVector->size()) < sort.ssize())
{
workVector->resize(sort.size());
}
const int movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
- if (ncg_home_old >= 0)
+ if (ncg_home_old >= 0 && !sort->sorted.empty())
{
+ GMX_RELEASE_ASSERT(sort->sorted.size() == static_cast<size_t>(ncg_home_old),
+ "The sorting buffer should contain the old home charge group indices");
+
std::vector<gmx_cgsort_t> &stationary = sort->stationary;
std::vector<gmx_cgsort_t> &moved = sort->moved;
/* Reorder the state */
gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
- GMX_RELEASE_ASSERT(cgsort.size() == dd->ncg_home, "We should sort all the home atom groups");
+ GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
if (state->flags & (1 << estX))
{
else
{
/* Copy the sorted ns cell indices back to the ns grid struct */
- for (gmx::index i = 0; i < cgsort.size(); i++)
+ for (gmx::index i = 0; i < cgsort.ssize(); i++)
{
fr->ns->grid->cell_index[i] = cgsort[i].nsc;
}
gmx_bool bMasterState,
int nstglobalcomm,
t_state *state_global,
- const gmx_mtop_t *top_global,
+ const gmx_mtop_t &top_global,
const t_inputrec *ir,
t_state *state_local,
PaddedVector<gmx::RVec> *f,
true, xGlobal,
&ddbox);
- distributeState(mdlog, dd, *top_global, state_global, ddbox, state_local, f);
+ distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
dd_make_local_cgs(dd, &top_local->cgs);
fr->cginfo,
state_local->x,
ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
- fr->nbv->grp[eintLocal].kernel_type,
+ fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
fr->nbv->nbat);
nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
wallcycle_sub_stop(wcycle, ewcsDD_GRID);
}
+ else
+ {
+ /* With the group scheme the sorting array is part of the DD state,
+ * but it just got out of sync, so mark as invalid by emptying it.
+ */
+ if (ir->cutoff_scheme == ecutsGROUP)
+ {
+ comm->sort->sorted.clear();
+ }
+ }
if (comm->useUpdateGroups)
{
if (dd->splitConstraints || dd->splitSettles)
{
/* Only for inter-cg constraints we need special code */
- n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
+ n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
constr, ir->nProjOrder,
top_local->idef.il);
}
if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
{
dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
- write_dd_pdb("dd_dump", step, "dump", top_global, cr,
+ write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
-1, state_local->x.rvec_array(), state_local->box);
}
if (comm->DD_debug > 0)
{
/* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
- check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
+ check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
"after partitioning");
}
#include <list>
#include "gromacs/domdec/domdec.h"
-#include "gromacs/ewald/ewald-utils.h"
+#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/unique_cptr.h"
-#include "calculate-spline-moduli.h"
-#include "pme-gather.h"
-#include "pme-gpu-internal.h"
-#include "pme-grid.h"
-#include "pme-internal.h"
-#include "pme-redistribute.h"
-#include "pme-solve.h"
-#include "pme-spline-work.h"
-#include "pme-spread.h"
+#include "calculate_spline_moduli.h"
+#include "pme_gather.h"
+#include "pme_gpu_internal.h"
+#include "pme_grid.h"
+#include "pme_internal.h"
+#include "pme_redistribute.h"
+#include "pme_solve.h"
+#include "pme_spline_work.h"
+#include "pme_spread.h"
/*! \brief Help build a descriptive message in \c error if there are
* \c errorReasons why PME on GPU is not supported.
addMessageIfNotSupported(const std::list<std::string> &errorReasons,
std::string *error)
{
- bool foundErrorReasons = errorReasons.empty();
- if (!foundErrorReasons && error)
+ bool isSupported = errorReasons.empty();
+ if (!isSupported && error)
{
std::string regressionTestMarker = "PME GPU does not support";
// this prefix is tested for in the regression tests script gmxtest.pl
- *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+ *error = regressionTestMarker;
+ if (errorReasons.size() == 1)
+ {
+ *error += " " + errorReasons.back();
+ }
+ else
+ {
+ *error += ": " + gmx::joinStrings(errorReasons, "; ");
+ }
+ *error += ".";
}
- return foundErrorReasons;
+ return isSupported;
}
- bool pme_gpu_supports_build(const gmx_hw_info_t &hwinfo,
- std::string *error)
+ bool pme_gpu_supports_build(std::string *error)
{
std::list<std::string> errorReasons;
if (GMX_DOUBLE)
{
- errorReasons.emplace_back("double precision");
+ errorReasons.emplace_back("a double-precision build");
}
if (GMX_GPU == GMX_GPU_NONE)
{
- errorReasons.emplace_back("non-GPU build of GROMACS");
+ errorReasons.emplace_back("a non-GPU build");
}
+ return addMessageIfNotSupported(errorReasons, error);
+ }
+
+ bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
+ std::string *error)
+ {
+ std::list<std::string> errorReasons;
if (GMX_GPU == GMX_GPU_OPENCL)
{
if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
{
- errorReasons.emplace_back("only AMD devices are supported");
+ errorReasons.emplace_back("non-AMD devices");
}
}
return addMessageIfNotSupported(errorReasons, error);
#include <algorithm>
#include "gromacs/domdec/domdec.h"
-#include "gromacs/ewald/pme-gather.h"
-#include "gromacs/ewald/pme-gpu-internal.h"
-#include "gromacs/ewald/pme-grid.h"
-#include "gromacs/ewald/pme-internal.h"
-#include "gromacs/ewald/pme-solve.h"
-#include "gromacs/ewald/pme-spread.h"
+#include "gromacs/ewald/pme_gather.h"
+#include "gromacs/ewald/pme_gpu_internal.h"
+#include "gromacs/ewald/pme_grid.h"
+#include "gromacs/ewald/pme_internal.h"
+#include "gromacs/ewald/pme_solve.h"
+#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
break;
case CodePath::GPU:
- implemented = (pme_gpu_supports_build(hwinfo, nullptr) &&
+ implemented = (pme_gpu_supports_build(nullptr) &&
+ pme_gpu_supports_hardware(hwinfo, nullptr) &&
pme_gpu_supports_input(*inputRec, mtop, nullptr));
break;
)
{
const index atomCount = coordinates.size();
- GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
+ GMX_RELEASE_ASSERT(atomCount == charges.ssize(), "Mismatch in atom data");
PmeSafePointer pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
pme_atomcomm_t *atc = nullptr;
{
pme_atomcomm_t *atc = &(pme->atc[0]);
const index atomCount = atc->n;
- GMX_RELEASE_ASSERT(forces.size() == atomCount, "Invalid force buffer size");
+ GMX_RELEASE_ASSERT(forces.ssize() == atomCount, "Invalid force buffer size");
const bool forceReductionWithInput = (inputTreatment == PmeForceOutputHandling::ReduceWithInput);
const real scale = 1.0;
const size_t threadIndex = 0;
const index atomCount = atc->n;
const index pmeOrder = pme->pme_order;
const index dimSize = pmeOrder * atomCount;
- GMX_RELEASE_ASSERT(dimSize == splineValues.size(), "Mismatch in spline data");
+ GMX_RELEASE_ASSERT(dimSize == splineValues.ssize(), "Mismatch in spline data");
real *splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
switch (mode)
{
const pme_atomcomm_t *atc = &(pme->atc[0]);
const index atomCount = atc->n;
- GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
+ GMX_RELEASE_ASSERT(atomCount == gridLineIndices.ssize(), "Mismatch in gridline indices size");
IVec paddedGridSizeUnused, gridSize(0, 0, 0);
pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
#include "testhardwarecontexts.h"
-#include "gromacs/compat/make_unique.h"
+#include <memory>
+
#include "gromacs/ewald/pme.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/detecthardware.h"
void PmeTestEnvironment::SetUp()
{
- hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
+ hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
hardwareInfo_ = hardwareInit();
- if (!pme_gpu_supports_build(*hardwareInfo_, nullptr))
+ if (!pme_gpu_supports_build(nullptr) ||
+ !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
{
// PME can only run on the CPU, so don't make any more test contexts.
return;
char stmp[200] = {};
get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
std::string description = "(GPU " + std::string(stmp) + ") ";
- hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>
+ hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>
(CodePath::GPU, description.c_str(),
deviceInfo));
}
--- /dev/null
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "insert_molecules.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "gromacs/commandline/cmdlineoptionsmodule.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/filetypes.h"
+#include "gromacs/fileio/xvgr.h"
+#include "gromacs/gmxlib/conformation_utilities.h"
+#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/options/basicoptions.h"
+#include "gromacs/options/filenameoption.h"
+#include "gromacs/options/ioptionscontainer.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/selection/nbsearch.h"
+#include "gromacs/selection/selection.h"
+#include "gromacs/selection/selectioncollection.h"
+#include "gromacs/selection/selectionoption.h"
+#include "gromacs/selection/selectionoptionbehavior.h"
+#include "gromacs/topology/atomprop.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/atomsbuilder.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/symtab.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
- #include "gromacs/utility/unique_cptr.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
- gmx_mtop_t *getTopology(bool /*required*/) override { return topInfo_.mtop(); }
+
+using gmx::RVec;
+
+/* enum for random rotations of inserted solutes */
+enum RotationType {
+ en_rotXYZ, en_rotZ, en_rotNone
+};
+const char *const cRotationEnum[] = {"xyz", "z", "none"};
+
+static void center_molecule(gmx::ArrayRef<RVec> x)
+{
+ rvec center = {0};
+ for (auto &xi : x)
+ {
+ rvec_inc(center, xi);
+ }
+ svmul(1.0/x.size(), center, center);
+ for (auto &xi : x)
+ {
+ rvec_dec(xi, center);
+ }
+}
+
+static void generate_trial_conf(gmx::ArrayRef<RVec> xin,
+ const rvec offset, RotationType enum_rot,
+ gmx::DefaultRandomEngine * rng,
+ std::vector<RVec> *xout)
+{
+ gmx::UniformRealDistribution<real> dist(0, 2.0*M_PI);
+ xout->assign(xin.begin(), xin.end());
+
+ real alfa = 0.0, beta = 0.0, gamma = 0.0;
+ switch (enum_rot)
+ {
+ case en_rotXYZ:
+ alfa = dist(*rng);
+ beta = dist(*rng);
+ gamma = dist(*rng);
+ break;
+ case en_rotZ:
+ alfa = beta = 0.;
+ gamma = dist(*rng);
+ break;
+ case en_rotNone:
+ alfa = beta = gamma = 0.;
+ break;
+ }
+ if (enum_rot == en_rotXYZ || enum_rot == en_rotZ)
+ {
+ rotate_conf(xout->size(), as_rvec_array(xout->data()), nullptr, alfa, beta, gamma);
+ }
+ for (size_t i = 0; i < xout->size(); ++i)
+ {
+ rvec_inc((*xout)[i], offset);
+ }
+}
+
+static bool isInsertionAllowed(gmx::AnalysisNeighborhoodSearch *search,
+ const std::vector<real> &exclusionDistances,
+ const std::vector<RVec> &x,
+ const std::vector<real> &exclusionDistances_insrt,
+ const t_atoms &atoms,
+ const std::set<int> &removableAtoms,
+ gmx::AtomsRemover *remover)
+{
+ gmx::AnalysisNeighborhoodPositions pos(x);
+ gmx::AnalysisNeighborhoodPairSearch pairSearch = search->startPairSearch(pos);
+ gmx::AnalysisNeighborhoodPair pair;
+ while (pairSearch.findNextPair(&pair))
+ {
+ const real r1 = exclusionDistances[pair.refIndex()];
+ const real r2 = exclusionDistances_insrt[pair.testIndex()];
+ if (pair.distance2() < gmx::square(r1 + r2))
+ {
+ if (removableAtoms.count(pair.refIndex()) == 0)
+ {
+ return false;
+ }
+ // TODO: If molecule information is available, this should ideally
+ // use it to remove whole molecules.
+ remover->markResidue(atoms, pair.refIndex(), true);
+ }
+ }
+ return true;
+}
+
+static void insert_mols(int nmol_insrt, int ntry, int seed,
+ real defaultDistance, real scaleFactor,
+ t_atoms *atoms, t_symtab *symtab, std::vector<RVec> *x,
+ const std::set<int> &removableAtoms,
+ const t_atoms &atoms_insrt, gmx::ArrayRef<RVec> x_insrt,
+ int ePBC, matrix box,
+ const std::string &posfn, const rvec deltaR,
+ RotationType enum_rot)
+{
+ fprintf(stderr, "Initialising inter-atomic distances...\n");
+ AtomProperties aps;
+ std::vector<real> exclusionDistances(
+ makeExclusionDistances(atoms, &aps, defaultDistance, scaleFactor));
+ const std::vector<real> exclusionDistances_insrt(
+ makeExclusionDistances(&atoms_insrt, &aps, defaultDistance, scaleFactor));
+
+ const real maxInsertRadius
+ = *std::max_element(exclusionDistances_insrt.begin(),
+ exclusionDistances_insrt.end());
+ real maxRadius = maxInsertRadius;
+ if (!exclusionDistances.empty())
+ {
+ const real maxExistingRadius
+ = *std::max_element(exclusionDistances.begin(),
+ exclusionDistances.end());
+ maxRadius = std::max(maxInsertRadius, maxExistingRadius);
+ }
+
+ // TODO: Make all of this exception-safe.
+ gmx::AnalysisNeighborhood nb;
+ nb.setCutoff(maxInsertRadius + maxRadius);
+
+
+ if (seed == 0)
+ {
+ seed = static_cast<int>(gmx::makeRandomSeed());
+ }
+ fprintf(stderr, "Using random seed %d\n", seed);
+
+ gmx::DefaultRandomEngine rng(seed);
+
+ t_pbc pbc;
+ set_pbc(&pbc, ePBC, box);
+
+ /* With -ip, take nmol_insrt from file posfn */
+ double **rpos = nullptr;
+ const bool insertAtPositions = !posfn.empty();
+ if (insertAtPositions)
+ {
+ int ncol;
+ nmol_insrt = read_xvg(posfn.c_str(), &rpos, &ncol);
+ if (ncol != 3)
+ {
+ gmx_fatal(FARGS, "Expected 3 columns (x/y/z coordinates) in file %s\n",
+ posfn.c_str());
+ }
+ fprintf(stderr, "Read %d positions from file %s\n\n",
+ nmol_insrt, posfn.c_str());
+ }
+
+ gmx::AtomsBuilder builder(atoms, symtab);
+ gmx::AtomsRemover remover(*atoms);
+ {
+ const int finalAtomCount = atoms->nr + nmol_insrt * atoms_insrt.nr;
+ const int finalResidueCount = atoms->nres + nmol_insrt * atoms_insrt.nres;
+ builder.reserve(finalAtomCount, finalResidueCount);
+ x->reserve(finalAtomCount);
+ exclusionDistances.reserve(finalAtomCount);
+ }
+
+ std::vector<RVec> x_n(x_insrt.size());
+
+ int mol = 0;
+ int trial = 0;
+ int firstTrial = 0;
+ int failed = 0;
+ gmx::UniformRealDistribution<real> dist;
+
+ while (mol < nmol_insrt && trial < ntry*nmol_insrt)
+ {
+ rvec offset_x;
+ if (!insertAtPositions)
+ {
+ // Insert at random positions.
+ offset_x[XX] = box[XX][XX] * dist(rng);
+ offset_x[YY] = box[YY][YY] * dist(rng);
+ offset_x[ZZ] = box[ZZ][ZZ] * dist(rng);
+ }
+ else
+ {
+ // Skip a position if ntry trials were not successful.
+ if (trial >= firstTrial + ntry)
+ {
+ fprintf(stderr, " skipped position (%.3f, %.3f, %.3f)\n",
+ rpos[XX][mol], rpos[YY][mol], rpos[ZZ][mol]);
+ ++mol;
+ ++failed;
+ firstTrial = trial;
+ continue;
+ }
+ // Insert at positions taken from option -ip file.
+ offset_x[XX] = rpos[XX][mol] + deltaR[XX]*(2 * dist(rng)-1);
+ offset_x[YY] = rpos[YY][mol] + deltaR[YY]*(2 * dist(rng)-1);
+ offset_x[ZZ] = rpos[ZZ][mol] + deltaR[ZZ]*(2 * dist(rng)-1);
+ }
+ fprintf(stderr, "\rTry %d", ++trial);
+ fflush(stderr);
+
+ generate_trial_conf(x_insrt, offset_x, enum_rot, &rng, &x_n);
+ gmx::AnalysisNeighborhoodPositions pos(*x);
+ gmx::AnalysisNeighborhoodSearch search = nb.initSearch(&pbc, pos);
+ if (isInsertionAllowed(&search, exclusionDistances, x_n, exclusionDistances_insrt,
+ *atoms, removableAtoms, &remover))
+ {
+ x->insert(x->end(), x_n.begin(), x_n.end());
+ exclusionDistances.insert(exclusionDistances.end(),
+ exclusionDistances_insrt.begin(),
+ exclusionDistances_insrt.end());
+ builder.mergeAtoms(atoms_insrt);
+ ++mol;
+ firstTrial = trial;
+ fprintf(stderr, " success (now %d atoms)!\n", builder.currentAtomCount());
+ }
+ }
+
+ fprintf(stderr, "\n");
+ /* print number of molecules added */
+ fprintf(stderr, "Added %d molecules (out of %d requested)\n",
+ mol - failed, nmol_insrt);
+
+ const int originalAtomCount = atoms->nr;
+ const int originalResidueCount = atoms->nres;
+ remover.refreshAtomCount(*atoms);
+ remover.removeMarkedElements(x);
+ remover.removeMarkedAtoms(atoms);
+ if (atoms->nr < originalAtomCount)
+ {
+ fprintf(stderr, "Replaced %d residues (%d atoms)\n",
+ originalResidueCount - atoms->nres,
+ originalAtomCount - atoms->nr);
+ }
+
+ if (rpos != nullptr)
+ {
+ for (int i = 0; i < DIM; ++i)
+ {
+ sfree(rpos[i]);
+ }
+ sfree(rpos);
+ }
+}
+
+namespace gmx
+{
+
+namespace
+{
+
+class InsertMolecules : public ICommandLineOptionsModule, public ITopologyProvider
+{
+ public:
+ InsertMolecules()
+ : bBox_(false), nmolIns_(0), nmolTry_(10), seed_(0),
+ defaultDistance_(0.105), scaleFactor_(0.57), enumRot_(en_rotXYZ)
+ {
+ clear_rvec(newBox_);
+ clear_rvec(deltaR_);
++ clear_mat(box_);
+ }
+
+ // From ITopologyProvider
- TopologyInformation topInfo_;
++ gmx_mtop_t *getTopology(bool /*required*/) override { return &top_; }
+ int getAtomCount() override { return 0; }
+
+ // From ICommandLineOptionsModule
+ void init(CommandLineModuleSettings * /*settings*/) override
+ {
+ }
+ void initOptions(IOptionsContainer *options,
+ ICommandLineOptionsModuleSettings *settings) override;
+ void optionsFinished() override;
+ int run() override;
+
+ private:
+ SelectionCollection selections_;
+
+ std::string inputConfFile_;
+ std::string insertConfFile_;
+ std::string positionFile_;
+ std::string outputConfFile_;
+ rvec newBox_;
+ bool bBox_;
+ int nmolIns_;
+ int nmolTry_;
+ int seed_;
+ real defaultDistance_;
+ real scaleFactor_;
+ rvec deltaR_;
+ RotationType enumRot_;
+ Selection replaceSel_;
+
- topInfo_.fillFromInputFile(inputConfFile_);
- if (topInfo_.mtop()->natoms == 0)
++ gmx_mtop_t top_;
++ std::vector<RVec> x_;
++ matrix box_;
++ int ePBC_;
+};
+
+void InsertMolecules::initOptions(IOptionsContainer *options,
+ ICommandLineOptionsModuleSettings *settings)
+{
+ const char *const desc[] = {
+ "[THISMODULE] inserts [TT]-nmol[tt] copies of the system specified in",
+ "the [TT]-ci[tt] input file. The insertions take place either into",
+ "vacant space in the solute conformation given with [TT]-f[tt], or",
+ "into an empty box given by [TT]-box[tt]. Specifying both [TT]-f[tt]",
+ "and [TT]-box[tt] behaves like [TT]-f[tt], but places a new box",
+ "around the solute before insertions. Any velocities present are",
+ "discarded.",
+ "",
+ "It is possible to also insert into a solvated configuration and",
+ "replace solvent atoms with the inserted atoms. To do this, use",
+ "[TT]-replace[tt] to specify a selection that identifies the atoms",
+ "that can be replaced. The tool assumes that all molecules in this",
+ "selection consist of single residues: each residue from this",
+ "selection that overlaps with the inserted molecules will be removed",
+ "instead of preventing insertion.",
+ "",
+ "By default, the insertion positions are random (with initial seed",
+ "specified by [TT]-seed[tt]). The program iterates until [TT]-nmol[tt]",
+ "molecules have been inserted in the box. Molecules are not inserted",
+ "where the distance between any existing atom and any atom of the",
+ "inserted molecule is less than the sum based on the van der Waals",
+ "radii of both atoms. A database ([TT]vdwradii.dat[tt]) of van der",
+ "Waals radii is read by the program, and the resulting radii scaled",
+ "by [TT]-scale[tt]. If radii are not found in the database, those",
+ "atoms are assigned the (pre-scaled) distance [TT]-radius[tt].",
+ "Note that the usefulness of those radii depends on the atom names,",
+ "and thus varies widely with force field.",
+ "",
+ "A total of [TT]-nmol[tt] * [TT]-try[tt] insertion attempts are made",
+ "before giving up. Increase [TT]-try[tt] if you have several small",
+ "holes to fill. Option [TT]-rot[tt] specifies whether the insertion",
+ "molecules are randomly oriented before insertion attempts.",
+ "",
+ "Alternatively, the molecules can be inserted only at positions defined in",
+ "positions.dat ([TT]-ip[tt]). That file should have 3 columns (x,y,z),",
+ "that give the displacements compared to the input molecule position",
+ "([TT]-ci[tt]). Hence, if that file should contain the absolute",
+ "positions, the molecule must be centered on (0,0,0) before using",
+ "[THISMODULE] (e.g. from [gmx-editconf] [TT]-center[tt]).",
+ "Comments in that file starting with # are ignored. Option [TT]-dr[tt]",
+ "defines the maximally allowed displacements during insertial trials.",
+ "[TT]-try[tt] and [TT]-rot[tt] work as in the default mode (see above)."
+ };
+
+ settings->setHelpText(desc);
+
+ std::shared_ptr<SelectionOptionBehavior> selectionOptionBehavior(
+ new SelectionOptionBehavior(&selections_, this));
+ settings->addOptionsBehavior(selectionOptionBehavior);
+
+ // TODO: Replace use of legacyType.
+ options->addOption(FileNameOption("f")
+ .legacyType(efSTX).inputFile()
+ .store(&inputConfFile_)
+ .defaultBasename("protein")
+ .description("Existing configuration to insert into"));
+ options->addOption(FileNameOption("ci")
+ .legacyType(efSTX).inputFile().required()
+ .store(&insertConfFile_)
+ .defaultBasename("insert")
+ .description("Configuration to insert"));
+ options->addOption(FileNameOption("ip")
+ .filetype(eftGenericData).inputFile()
+ .store(&positionFile_)
+ .defaultBasename("positions")
+ .description("Predefined insertion trial positions"));
+ options->addOption(FileNameOption("o")
+ .legacyType(efSTO).outputFile().required()
+ .store(&outputConfFile_)
+ .defaultBasename("out")
+ .description("Output configuration after insertion"));
+
+ options->addOption(SelectionOption("replace").onlyAtoms()
+ .store(&replaceSel_)
+ .description("Atoms that can be removed if overlapping"));
+ selectionOptionBehavior->initOptions(options);
+
+ options->addOption(RealOption("box").vector()
+ .store(newBox_).storeIsSet(&bBox_)
+ .description("Box size (in nm)"));
+ options->addOption(IntegerOption("nmol")
+ .store(&nmolIns_)
+ .description("Number of extra molecules to insert"));
+ options->addOption(IntegerOption("try")
+ .store(&nmolTry_)
+ .description("Try inserting [TT]-nmol[tt] times [TT]-try[tt] times"));
+ options->addOption(IntegerOption("seed")
+ .store(&seed_)
+ .description("Random generator seed (0 means generate)"));
+ options->addOption(RealOption("radius")
+ .store(&defaultDistance_)
+ .description("Default van der Waals distance"));
+ options->addOption(RealOption("scale")
+ .store(&scaleFactor_)
+ .description("Scale factor to multiply Van der Waals radii from the database in share/gromacs/top/vdwradii.dat. The default value of 0.57 yields density close to 1000 g/l for proteins in water."));
+ options->addOption(RealOption("dr").vector()
+ .store(deltaR_)
+ .description("Allowed displacement in x/y/z from positions in [TT]-ip[tt] file"));
+ options->addOption(EnumOption<RotationType>("rot").enumValue(cRotationEnum)
+ .store(&enumRot_)
+ .description("Rotate inserted molecules randomly"));
+}
+
+void InsertMolecules::optionsFinished()
+{
+ if (nmolIns_ <= 0 && positionFile_.empty())
+ {
+ GMX_THROW(InconsistentInputError("Either -nmol must be larger than 0, "
+ "or positions must be given with -ip."));
+ }
+ if (inputConfFile_.empty() && !bBox_)
+ {
+ GMX_THROW(InconsistentInputError("When no solute (-f) is specified, "
+ "a box size (-box) must be specified."));
+ }
+ if (replaceSel_.isValid() && inputConfFile_.empty())
+ {
+ GMX_THROW(InconsistentInputError("Replacement (-replace) only makes sense "
+ "together with an existing configuration (-f)."));
+ }
+
+ if (!inputConfFile_.empty())
+ {
++ bool bTprFileWasRead;
++ rvec *temporaryX = nullptr;
+ fprintf(stderr, "Reading solute configuration\n");
- const char *outputTitle = topInfo_.name();
- std::vector<RVec> xOutput;
- matrix box = {{ 0 }};
- if (topInfo_.hasTopology())
- {
- xOutput = copyOf(topInfo_.x());
- topInfo_.getBox(box);
- }
- else
- {
- xOutput.resize(topInfo_.mtop()->natoms);
- }
- auto atomsSolute = topInfo_.copyAtoms();
++ readConfAndTopology(inputConfFile_.c_str(), &bTprFileWasRead, &top_,
++ &ePBC_, &temporaryX, nullptr, box_);
++ x_.assign(temporaryX, temporaryX + top_.natoms);
++ sfree(temporaryX);
++ if (top_.natoms == 0)
+ {
+ fprintf(stderr, "Note: no atoms in %s\n", inputConfFile_.c_str());
+ }
+ }
+}
+
+int InsertMolecules::run()
+{
- set_pbc(&pbc, topInfo_.ePBC(), box);
+ std::set<int> removableAtoms;
+ if (replaceSel_.isValid())
+ {
+ t_pbc pbc;
- fr->natoms = topInfo_.mtop()->natoms;
++ set_pbc(&pbc, ePBC_, box_);
+ t_trxframe *fr;
+ snew(fr, 1);
- fr->x = as_rvec_array(xOutput.data());
++ fr->natoms = x_.size();
+ fr->bX = TRUE;
- int ePBCForOutput = topInfo_.ePBC();
++ fr->x = as_rvec_array(x_.data());
+ selections_.evaluate(fr, &pbc);
+ sfree(fr);
+ removableAtoms.insert(replaceSel_.atomIndices().begin(),
+ replaceSel_.atomIndices().end());
+ // TODO: It could be nice to check that removableAtoms contains full
+ // residues, since we anyways remove whole residues instead of
+ // individual atoms.
+ }
+
- clear_mat(box);
- box[XX][XX] = newBox_[XX];
- box[YY][YY] = newBox_[YY];
- box[ZZ][ZZ] = newBox_[ZZ];
++ int ePBCForOutput = ePBC_;
+ if (bBox_)
+ {
+ ePBCForOutput = epbcXYZ;
- if (det(box) == 0)
++ clear_mat(box_);
++ box_[XX][XX] = newBox_[XX];
++ box_[YY][YY] = newBox_[YY];
++ box_[ZZ][ZZ] = newBox_[ZZ];
+ }
- fprintf(stderr, "Reading molecule configuration\n");
- TopologyInformation topInfoForInsertedMolecule;
- topInfoForInsertedMolecule.fillFromInputFile(insertConfFile_);
- auto atomsInserted = topInfoForInsertedMolecule.atoms();
- std::vector<RVec> xInserted = copyOf(topInfoForInsertedMolecule.x());
-
- if (topInfoForInsertedMolecule.mtop()->natoms == 0)
- {
- gmx_fatal(FARGS, "No molecule in %s, please check your input",
- insertConfFile_.c_str());
- }
- if (outputTitle == nullptr)
- {
- outputTitle = topInfoForInsertedMolecule.name();
- }
- if (positionFile_.empty())
++ if (det(box_) == 0)
+ {
+ gmx_fatal(FARGS, "Undefined solute box.\nCreate one with gmx editconf "
+ "or give explicit -box command line option");
+ }
+
- center_molecule(xInserted);
++ gmx_mtop_t topInserted;
++ t_atoms atomsInserted;
++ std::vector<RVec> xInserted;
+ {
- auto symtabInserted = duplicateSymtab(&topInfo_.mtop()->symtab);
- const sfree_guard symtabInsertedGuard(symtabInserted);
++ bool bTprFileWasRead;
++ int ePBC_dummy;
++ matrix box_dummy;
++ rvec *temporaryX;
++ readConfAndTopology(insertConfFile_.c_str(), &bTprFileWasRead, &topInserted,
++ &ePBC_dummy, &temporaryX, nullptr, box_dummy);
++ xInserted.assign(temporaryX, temporaryX + topInserted.natoms);
++ sfree(temporaryX);
++ atomsInserted = gmx_mtop_global_atoms(&topInserted);
++ if (atomsInserted.nr == 0)
++ {
++ gmx_fatal(FARGS, "No molecule in %s, please check your input",
++ insertConfFile_.c_str());
++ }
++ if (top_.name == nullptr)
++ {
++ top_.name = topInserted.name;
++ }
++ if (positionFile_.empty())
++ {
++ center_molecule(xInserted);
++ }
+ }
+
- atomsSolute.get(), symtabInserted, &xOutput, removableAtoms, *atomsInserted, xInserted,
- ePBCForOutput, box, positionFile_, deltaR_, enumRot_);
++ t_atoms atoms = gmx_mtop_global_atoms(&top_);
++
+ /* add nmol_ins molecules of atoms_ins
+ in random orientation at random place */
+ insert_mols(nmolIns_, nmolTry_, seed_, defaultDistance_, scaleFactor_,
- write_sto_conf(outputConfFile_.c_str(), outputTitle, atomsSolute.get(),
- as_rvec_array(xOutput.data()), nullptr, ePBCForOutput, box);
++ &atoms, &top_.symtab, &x_, removableAtoms, atomsInserted, xInserted,
++ ePBCForOutput, box_, positionFile_, deltaR_, enumRot_);
+
+ /* write new configuration to file confout */
+ fprintf(stderr, "Writing generated configuration to %s\n",
+ outputConfFile_.c_str());
- atomsSolute->nr, atomsSolute->nres);
- done_symtab(symtabInserted);
++ write_sto_conf(outputConfFile_.c_str(), *top_.name, &atoms,
++ as_rvec_array(x_.data()), nullptr, ePBCForOutput, box_);
+
+ /* print size of generated configuration */
+ fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
++ atoms.nr, atoms.nres);
++
++ done_atom(&atoms);
++ done_atom(&atomsInserted);
++
+ return 0;
+}
+
+} // namespace
+
+
+const char* InsertMoleculesInfo::name()
+{
+ static const char* name = "insert-molecules";
+ return name;
+}
+
+const char* InsertMoleculesInfo::shortDescription()
+{
+ static const char* shortDescription =
+ "Insert molecules into existing vacancies";
+ return shortDescription;
+}
+
+ICommandLineOptionsModulePointer InsertMoleculesInfo::create()
+{
+ return ICommandLineOptionsModulePointer(new InsertMolecules());
+}
+
+} // namespace gmx
#include <algorithm>
#include <string>
-#include "gromacs/awh/read-params.h"
+#include "gromacs/awh/read_params.h"
#include "gromacs/fileio/readinp.h"
#include "gromacs/fileio/warninp.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/mdrunutility/mdmodules.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/pull-params.h"
+#include "gromacs/mdtypes/pull_params.h"
#include "gromacs/options/options.h"
#include "gromacs/options/treesupport.h"
#include "gromacs/pbcutil/pbc.h"
expand->nstexpanded = fep->nstdhdl;
/* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
}
- if ((expand->nstexpanded < 0) && ir->bSimTemp)
- {
- expand->nstexpanded = 2*static_cast<int>(ir->opts.tau_t[0]/ir->delta_t);
- /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
- 2*tau_t just to be careful so it's not to frequent */
- }
}
}
snew(grps->nm_ind, groupsFromMdpFile.size()+1); /* +1 for possible rest group */
- for (int i = 0; i != groupsFromMdpFile.size(); ++i)
+ for (int i = 0; i != groupsFromMdpFile.ssize(); ++i)
{
/* Lookup the group name in the block structure */
gid = search_string(groupsFromMdpFile[i].c_str(), block->nr, gnames);
static void calc_nrdf(const gmx_mtop_t *mtop, t_inputrec *ir, char **gnames)
{
t_grpopts *opts;
- const gmx_groups_t *groups;
pull_params_t *pull;
int natoms, ai, aj, i, j, d, g, imin, jmin;
int *nrdf2, *na_vcm, na_tot;
double *nrdf_tc, *nrdf_vcm, nrdf_uc, *nrdf_vcm_sub;
ivec *dof_vcm;
- gmx_mtop_atomloop_all_t aloop;
int mol, ftype, as;
/* Calculate nrdf.
opts = &ir->opts;
- groups = &mtop->groups;
+ const gmx_groups_t &groups = mtop->groups;
natoms = mtop->natoms;
/* Allocate one more for a possible rest group */
/* We need to sum degrees of freedom into doubles,
* since floats give too low nrdf's above 3 million atoms.
*/
- snew(nrdf_tc, groups->grps[egcTC].nr+1);
- snew(nrdf_vcm, groups->grps[egcVCM].nr+1);
- snew(dof_vcm, groups->grps[egcVCM].nr+1);
- snew(na_vcm, groups->grps[egcVCM].nr+1);
- snew(nrdf_vcm_sub, groups->grps[egcVCM].nr+1);
+ snew(nrdf_tc, groups.grps[egcTC].nr+1);
+ snew(nrdf_vcm, groups.grps[egcVCM].nr+1);
+ snew(dof_vcm, groups.grps[egcVCM].nr+1);
+ snew(na_vcm, groups.grps[egcVCM].nr+1);
+ snew(nrdf_vcm_sub, groups.grps[egcVCM].nr+1);
- for (i = 0; i < groups->grps[egcTC].nr; i++)
+ for (i = 0; i < groups.grps[egcTC].nr; i++)
{
nrdf_tc[i] = 0;
}
- for (i = 0; i < groups->grps[egcVCM].nr+1; i++)
+ for (i = 0; i < groups.grps[egcVCM].nr+1; i++)
{
nrdf_vcm[i] = 0;
clear_ivec(dof_vcm[i]);
}
snew(nrdf2, natoms);
- aloop = gmx_mtop_atomloop_all_init(mtop);
- const t_atom *atom;
- while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
+ for (const AtomProxy atomP : AtomRange(*mtop))
{
+ const t_atom &local = atomP.atom();
+ int i = atomP.globalAtomNumber();
nrdf2[i] = 0;
- if (atom->ptype == eptAtom || atom->ptype == eptNucleus)
+ if (local.ptype == eptAtom || local.ptype == eptNucleus)
{
g = getGroupType(groups, egcFREEZE, i);
- for (d = 0; d < DIM; d++)
+ for (int d = 0; d < DIM; d++)
{
if (opts->nFreeze[g][d] == 0)
{
for (const gmx_molblock_t &molb : mtop->molblock)
{
const gmx_moltype_t &molt = mtop->moltype[molb.type];
- atom = molt.atoms.atom;
+ const t_atom *atom = molt.atoms.atom;
for (mol = 0; mol < molb.nmol; mol++)
{
for (ftype = F_CONSTR; ftype <= F_CONSTRNC; ftype++)
nrdf_vcm[getGroupType(groups, egcVCM, ai)] -= 0.5*imin;
if (nrdf_tc[getGroupType(groups, egcTC, ai)] < 0)
{
- gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups->grps[egcTC].nm_ind[getGroupType(groups, egcTC, ai)]]);
+ gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups.grps[egcTC].nm_ind[getGroupType(groups, egcTC, ai)]]);
}
}
else
* the number of degrees of freedom in each vcm group when COM
* translation is removed and 6 when rotation is removed as well.
*/
- for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
+ for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
{
switch (ir->comm_mode)
{
}
}
- for (i = 0; i < groups->grps[egcTC].nr; i++)
+ for (i = 0; i < groups.grps[egcTC].nr; i++)
{
/* Count the number of atoms of TC group i for every VCM group */
- for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
+ for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
{
na_vcm[j] = 0;
}
*/
nrdf_uc = nrdf_tc[i];
nrdf_tc[i] = 0;
- for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
+ for (j = 0; j < groups.grps[egcVCM].nr+1; j++)
{
if (nrdf_vcm[j] > nrdf_vcm_sub[j])
{
}
}
}
- for (i = 0; (i < groups->grps[egcTC].nr); i++)
+ for (i = 0; (i < groups.grps[egcTC].nr); i++)
{
opts->nrdf[i] = nrdf_tc[i];
if (opts->nrdf[i] < 0)
}
fprintf(stderr,
"Number of degrees of freedom in T-Coupling group %s is %.2f\n",
- gnames[groups->grps[egcTC].nm_ind[i]], opts->nrdf[i]);
+ gnames[groups.grps[egcTC].nm_ind[i]], opts->nrdf[i]);
}
sfree(nrdf2);
gmx_fatal(FARGS, "Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
}
+ /* final check before going out of scope if simulated tempering variables
+ * need to be set to default values.
+ */
+ if ((ir->expandedvals->nstexpanded < 0) && ir->bSimTemp)
+ {
+ ir->expandedvals->nstexpanded = 2*static_cast<int>(ir->opts.tau_t[0]/ir->delta_t);
+ warning(wi, gmx::formatString("the value for nstexpanded was not specified for "
+ " expanded ensemble simulated tempering. It is set to 2*tau_t (%d) "
+ "by default, but it is recommended to set it to an explicit value!",
+ ir->expandedvals->nstexpanded));
+ }
for (i = 0; (i < grps->nr); i++)
{
sfree(gnames[i]);
real *mgrp, mt;
rvec acc;
gmx_mtop_atomloop_block_t aloopb;
- gmx_mtop_atomloop_all_t aloop;
ivec AbsRef;
char warn_buf[STRLEN];
{
clear_rvec(acc);
snew(mgrp, sys->groups.grps[egcACC].nr);
- aloop = gmx_mtop_atomloop_all_init(sys);
- const t_atom *atom;
- while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
+ for (const AtomProxy atomP : AtomRange(*sys))
{
- mgrp[getGroupType(&sys->groups, egcACC, i)] += atom->m;
+ const t_atom &local = atomP.atom();
+ int i = atomP.globalAtomNumber();
+ mgrp[getGroupType(sys->groups, egcACC, i)] += local.m;
}
mt = 0.0;
for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
#include "gromacs/commandline/pargs.h"
#include "gromacs/fileio/confio.h"
#include "gromacs/fileio/pdbio.h"
-#include "gromacs/gmxlib/conformation-utilities.h"
+#include "gromacs/gmxlib/conformation_utilities.h"
#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/topology/atomprop.h"
#include "gromacs/topology/atoms.h"
#include "gromacs/topology/atomsbuilder.h"
+ #include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
#include "gromacs/utility/arraysize.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
- #include "gromacs/utility/unique_cptr.h"
using gmx::RVec;
remover.removeMarkedAtoms(atoms);
}
- static void add_solv(const char *filename,
- const gmx::TopologyInformation &topInfo,
- t_atoms *atoms,
+ static void add_solv(const char *filename, t_atoms *atoms,
+ t_symtab *symtab,
std::vector<RVec> *x, std::vector<RVec> *v,
- matrix box, AtomProperties *aps,
- int ePBC, matrix box, gmx_atomprop_t aps,
++ int ePBC, matrix box, AtomProperties *aps,
real defaultDistance, real scaleFactor,
real rshell, int max_sol)
{
- gmx::TopologyInformation topInfoSolvent;
- topInfoSolvent.fillFromInputFile(gmx::findLibraryFile(filename));
- auto atomsSolvent = topInfoSolvent.copyAtoms();
- std::vector<RVec> xSolvent, vSolvent;
- matrix boxSolvent = {{ 0 }};
- if (topInfoSolvent.hasTopology())
- {
- xSolvent = copyOf(topInfoSolvent.x());
- vSolvent = copyOf(topInfoSolvent.v());
- topInfoSolvent.getBox(boxSolvent);
- }
- else
- {
- xSolvent.resize(atomsSolvent->nr);
- vSolvent.resize(atomsSolvent->nr);
- }
+ gmx_mtop_t topSolvent;
+ std::vector<RVec> xSolvent, vSolvent;
+ matrix boxSolvent = {{ 0 }};
+ int ePBCSolvent;
+
+ fprintf(stderr, "Reading solvent configuration\n");
+ bool bTprFileWasRead;
+ rvec *temporaryX = nullptr, *temporaryV = nullptr;
+ readConfAndTopology(gmx::findLibraryFile(filename).c_str(), &bTprFileWasRead, &topSolvent,
+ &ePBCSolvent, &temporaryX, &temporaryV, boxSolvent);
+ t_atoms *atomsSolvent;
+ snew(atomsSolvent, 1);
+ *atomsSolvent = gmx_mtop_global_atoms(&topSolvent);
+ xSolvent.assign(temporaryX, temporaryX + topSolvent.natoms);
+ sfree(temporaryX);
+ vSolvent.assign(temporaryV, temporaryV + topSolvent.natoms);
+ sfree(temporaryV);
if (gmx::boxIsZero(boxSolvent))
{
gmx_fatal(FARGS, "No box information for solvent in %s, please use a properly formatted file\n",
const std::vector<real> exclusionDistances(
makeExclusionDistances(atoms, aps, defaultDistance, scaleFactor));
std::vector<real> exclusionDistances_solvt(
- makeExclusionDistances(atomsSolvent.get(), aps, defaultDistance, scaleFactor));
+ makeExclusionDistances(atomsSolvent, aps, defaultDistance, scaleFactor));
/* generate a new solvent configuration */
fprintf(stderr, "Generating solvent configuration\n");
t_pbc pbc;
- set_pbc(&pbc, topInfo.ePBC(), box);
+ set_pbc(&pbc, ePBC, box);
if (!gmx::boxesAreEqual(boxSolvent, box))
{
if (TRICLINIC(boxSolvent))
"You can try to pass the same box for -cp and -cs.");
}
/* apply pbc for solvent configuration for whole molecules */
- rm_res_pbc(atomsSolvent.get(), &xSolvent, boxSolvent);
- replicateSolventBox(atomsSolvent.get(), &xSolvent, &vSolvent, &exclusionDistances_solvt,
+ rm_res_pbc(atomsSolvent, &xSolvent, boxSolvent);
+ replicateSolventBox(atomsSolvent, &xSolvent, &vSolvent, &exclusionDistances_solvt,
boxSolvent, box);
- if (topInfo.ePBC() != epbcNONE)
+ if (ePBC != epbcNONE)
{
- removeSolventBoxOverlap(atomsSolvent.get(), &xSolvent, &vSolvent, &exclusionDistances_solvt, pbc);
+ removeSolventBoxOverlap(atomsSolvent, &xSolvent, &vSolvent, &exclusionDistances_solvt, pbc);
}
}
if (atoms->nr > 0)
{
if (rshell > 0.0)
{
- removeSolventOutsideShell(atomsSolvent.get(), &xSolvent, &vSolvent,
+ removeSolventOutsideShell(atomsSolvent, &xSolvent, &vSolvent,
&exclusionDistances_solvt, pbc, *x, rshell);
}
- removeSolventOverlappingWithSolute(atomsSolvent.get(), &xSolvent, &vSolvent,
+ removeSolventOverlappingWithSolute(atomsSolvent, &xSolvent, &vSolvent,
&exclusionDistances_solvt, pbc, *x,
exclusionDistances);
}
if (max_sol > 0 && atomsSolvent->nres > max_sol)
{
const int numberToRemove = atomsSolvent->nres - max_sol;
- removeExtraSolventMolecules(atomsSolvent.get(), &xSolvent, &vSolvent, numberToRemove);
+ removeExtraSolventMolecules(atomsSolvent, &xSolvent, &vSolvent, numberToRemove);
}
/* Sort the solvent mixture, not the protein... */
- t_atoms *newatoms = nullptr;
- t_atoms *sortedAtomsSolvent = atomsSolvent.get();
+ t_atoms *newatoms = nullptr;
+ // The sort_molecule function does something creative with the
+ // t_atoms pointers. We need to make sure we neither leak, nor
+ // double-free, so make a shallow pointer that is fine for it to
+ // change.
+ t_atoms *sortedAtomsSolvent = atomsSolvent;
sort_molecule(&sortedAtomsSolvent, &newatoms, &xSolvent, &vSolvent);
// Merge the two configurations.
v->insert(v->end(), vSolvent.begin(), vSolvent.end());
}
{
- gmx::AtomsBuilder builder(atoms, &topInfo.mtop()->symtab);
+ gmx::AtomsBuilder builder(atoms, symtab);
builder.mergeAtoms(*sortedAtomsSolvent);
}
fprintf(stderr, "Generated solvent containing %d atoms in %d residues\n",
- sortedAtomsSolvent->nr, sortedAtomsSolvent->nres);
+ atomsSolvent->nr, atomsSolvent->nres);
if (newatoms)
{
done_atom(newatoms);
sfree(newatoms);
}
+ if (atomsSolvent)
+ {
+ done_atom(atomsSolvent);
+ sfree(atomsSolvent);
+ }
}
-static void update_top(t_atoms *atoms, int firstSolventResidueIndex, matrix box, int NFILE, t_filenm fnm[],
- gmx_atomprop_t aps)
+static void update_top(t_atoms *atoms,
+ int firstSolventResidueIndex,
+ matrix box,
+ int NFILE,
+ t_filenm fnm[],
+ AtomProperties *aps)
{
FILE *fpin, *fpout;
char buf[STRLEN*2], buf2[STRLEN], *temp;
mtot = 0;
for (i = 0; (i < atoms->nr); i++)
{
- gmx_atomprop_query(aps, epropMass,
- *atoms->resinfo[atoms->atom[i].resind].name,
- *atoms->atomname[i], &mm);
+ aps->setAtomProperty(epropMass,
+ std::string(*atoms->resinfo[atoms->atom[i].resind].name),
+ std::string(*atoms->atomname[i]), &mm);
mtot += mm;
}
/* parameter data */
gmx_bool bProt, bBox;
const char *conf_prot, *confout;
- gmx_atomprop_t aps;
t_filenm fnm[] = {
{ efSTX, "-cp", "protein", ffOPTRD },
"a box size (-box) must be specified");
}
- aps = gmx_atomprop_init();
+ AtomProperties aps;
- gmx::TopologyInformation topInfo;
- std::vector<RVec> x, v;
- matrix box = {{ 0 }};
+ /* solute configuration data */
+ gmx_mtop_t top;
+ std::vector<RVec> x, v;
+ matrix box = {{ 0 }};
+ int ePBC = -1;
+ t_atoms *atoms;
+ snew(atoms, 1);
if (bProt)
{
/* Generate a solute configuration */
conf_prot = opt2fn("-cp", NFILE, fnm);
- topInfo.fillFromInputFile(conf_prot);
- x = copyOf(topInfo.x());
- if (bReadV)
- {
- v = copyOf(topInfo.v());
- }
- topInfo.getBox(box);
fprintf(stderr, "Reading solute configuration%s\n",
bReadV ? " and velocities" : "");
- if (bReadV && v.empty())
+ bool bTprFileWasRead;
+ rvec *temporaryX = nullptr, *temporaryV = nullptr;
+ readConfAndTopology(conf_prot, &bTprFileWasRead, &top,
+ &ePBC, &temporaryX, bReadV ? &temporaryV : nullptr, box);
+ *atoms = gmx_mtop_global_atoms(&top);
+ x.assign(temporaryX, temporaryX + top.natoms);
+ sfree(temporaryX);
+ if (temporaryV)
+ {
+ v.assign(temporaryV, temporaryV + top.natoms);
+ sfree(temporaryV);
+ }
+ else if (bReadV)
{
fprintf(stderr, "Note: no velocities found\n");
}
- if (topInfo.mtop()->natoms == 0)
+ if (atoms->nr == 0)
{
fprintf(stderr, "Note: no atoms in %s\n", conf_prot);
bProt = FALSE;
}
else
{
- firstSolventResidueIndex = topInfo.atoms()->nres;
+ firstSolventResidueIndex = atoms->nres;
}
}
- auto atoms = topInfo.copyAtoms();
- int ePBCForOutput = topInfo.ePBC();
+ int ePBCForOutput = ePBC;
if (bBox)
{
ePBCForOutput = epbcXYZ;
"or give explicit -box command line option");
}
- add_solv(solventFileName, topInfo, atoms.get(), &x, &v, box,
+ add_solv(solventFileName, atoms, &top.symtab, &x, &v, ePBCForOutput, box,
- aps, defaultDistance, scaleFactor, r_shell, max_sol);
+ &aps, defaultDistance, scaleFactor, r_shell, max_sol);
/* write new configuration 1 to file confout */
confout = ftp2fn(efSTO, NFILE, fnm);
fprintf(stderr, "Writing generated configuration to %s\n", confout);
- const char *outputTitle = (bProt ? *topInfo.mtop()->name : "Generated by gmx solvate");
- write_sto_conf(confout, outputTitle, atoms.get(), as_rvec_array(x.data()),
+ const char *outputTitle = (bProt ? *top.name : "Generated by gmx solvate");
+ write_sto_conf(confout, outputTitle, atoms, as_rvec_array(x.data()),
!v.empty() ? as_rvec_array(v.data()) : nullptr, ePBCForOutput, box);
/* print size of generated configuration */
fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
atoms->nr, atoms->nres);
- update_top(atoms.get(), firstSolventResidueIndex, box, NFILE, fnm, &aps);
- update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, aps);
++ update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, &aps);
+
+ done_atom(atoms);
+ sfree(atoms);
- gmx_atomprop_destroy(aps);
output_env_done(oenv);
return 0;
return impl_->nflexcon;
}
+ bool Constraints::havePerturbedConstraints() const
+ {
+ const gmx_ffparams_t &ffparams = impl_->mtop.ffparams;
+
+ for (size_t i = 0; i < ffparams.functype.size(); i++)
+ {
+ if ((ffparams.functype[i] == F_CONSTR ||
+ ffparams.functype[i] == F_CONSTRNC) &&
+ ffparams.iparams[i].constr.dA != ffparams.iparams[i].constr.dB)
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
//! Clears constraint quantities for atoms in nonlocal region.
static void clear_constraint_quantity_nonlocal(gmx_domdec_t *dd, rvec *q)
{
}
else
{
- return EmptyArrayRef();
+ return {};
}
}
Constraints::Impl::~Impl()
{
+ for (auto blocka : at2con_mt)
+ {
+ done_blocka(&blocka);
+ }
+ if (bSettleErrorHasOccurred != nullptr)
+ {
+ sfree(bSettleErrorHasOccurred);
+ }
+ if (vir_r_m_dr_th != nullptr)
+ {
+ sfree(vir_r_m_dr_th);
+ }
+ if (settled != nullptr)
+ {
+ settle_free(settled);
+ }
done_lincs(lincsd);
}
/*! \brief Returns the total number of flexible constraints in the system. */
int numFlexibleConstraints() const;
+ /*! \brief Returns whether the system contains perturbed constraints */
+ bool havePerturbedConstraints() const;
+
/*! \brief Set up all the local constraints for the domain.
*
* \todo Make this a callback that is called automatically
gmx::ArrayRef<const int> iatom_constrnc,
int con)
{
- if (con*3 < iatom_constr.size())
+ if (con*3 < iatom_constr.ssize())
{
return iatom_constr.data() + con*3;
}
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/ewald.h"
-#include "gromacs/ewald/long-range-correction.h"
+#include "gromacs/ewald/long_range_correction.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
-#include "gromacs/listed-forces/listed-forces.h"
+#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/forcerec_threading.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/ns.h"
#include "gromacs/mdlib/qmmm.h"
double &enerpart_lambda = enerd->enerpart_lambda[i + 1];
- for (gmx::index j = 0; j < lambda.size(); j++)
+ for (gmx::index j = 0; j < lambda.ssize(); j++)
{
/* Note that this loop is over all dhdl components, not just the separated ones */
const double dlam = fepvals->all_lambda[j][i] - lambda[j];
enerpart_lambda += dlam*enerd->term[F_DVDL_CONSTR];
}
- if (j == efptMASS)
+ if (j == efptMASS && !fepvals->separate_dvdl[j])
{
enerpart_lambda += dlam*enerd->term[F_DKDL];
}
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
+#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
-#include "gromacs/listed-forces/bonded.h"
-#include "gromacs/listed-forces/disre.h"
-#include "gromacs/listed-forces/gpubonded.h"
-#include "gromacs/listed-forces/manage-threading.h"
-#include "gromacs/listed-forces/orires.h"
+#include "gromacs/listed_forces/bonded.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/manage_threading.h"
+#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_atomdata.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
-#include "gromacs/mdlib/nbnxn_grid.h"
-#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
-#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
-#include "nbnxn_gpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_prune.h"
-
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
}
}
-static void do_nb_verlet(const t_forcerec *fr,
- const interaction_const_t *ic,
- gmx_enerdata_t *enerd,
- int flags, int ilocality,
- int clearF,
- int64_t step,
- t_nrnb *nrnb,
- gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec *fr,
+ const interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ const int flags,
+ const Nbnxm::InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
{
if (!(flags & GMX_FORCE_NONBONDED))
{
* the current coordinates of the atoms.
*/
wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
- nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
+ NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
}
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
- switch (nbvg->kernel_type)
- {
- case nbnxnk4x4_PlainC:
- case nbnxnk4xN_SIMD_4xN:
- case nbnxnk4xN_SIMD_2xNN:
- nbnxn_kernel_cpu(nbvg,
- nbv->nbat,
- ic,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
- break;
-
- case nbnxnk8x8x8_GPU:
- nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
- break;
+ NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
- case nbnxnk8x8x8_PlainC:
- nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
- nbv->nbat, ic,
- fr->shift_vec,
- flags,
- clearF,
- nbv->nbat->out[0].f,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
- break;
-
- default:
- GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
-
- }
if (!bUsingGpuKernels)
{
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
-
- int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
- if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
- {
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
- }
- else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
- (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
- {
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
- }
- else
- {
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
- }
- enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
- if (flags & GMX_FORCE_ENERGY)
- {
- /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
- enr_nbnxn_kernel_ljc += 1;
- enr_nbnxn_kernel_lj += 1;
- }
-
- inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
- nbvg->nbl_lists.natpair_ljq);
- inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
- nbvg->nbl_lists.natpair_lj);
- /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
- inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
- nbvg->nbl_lists.natpair_q);
-
- if (ic->vdw_modifier == eintmodFORCESWITCH)
- {
- /* We add up the switch cost separately */
- inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
- nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
- }
- if (ic->vdw_modifier == eintmodPOTSWITCH)
- {
- /* We add up the switch cost separately */
- inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
- nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
- }
- if (ic->vdwtype == evdwPME)
- {
- /* We add up the LJ Ewald cost separately */
- inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
- nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
- }
}
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
- isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
- flags, eatLocal,
- haveOtherWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fshift, completionType);
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+ flags,
+ Nbnxm::AtomLocality::Local,
+ haveOtherWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
// To get the call count right, when the task finished we
// issue a start/stop.
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
}
}
*/
int numRollingParts = nbv->listParams->numRollingParts;
GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
- int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+ int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
bool stepIsEven = ((stepWithCurrentList & 1) == 0);
if (stepWithCurrentList > 0 &&
stepWithCurrentList < inputrec->nstlist - 1 &&
(stepIsEven || DOMAINDECOMP(cr)))
{
- nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
- stepIsEven ? eintLocal : eintNonlocal,
- numRollingParts);
+ Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+ stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+ numRollingParts);
}
}
nonbonded_verlet_t *nbv = fr->nbv;
bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
- bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+ bNS = ((flags & GMX_FORCE_NS) != 0);
bFillGrid = (bNS && bStateChanged);
bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
nullptr, 0, mdatoms->homenr, -1,
fr->cginfo, x.unpaddedArrayRef(),
0, nullptr,
- nbv->grp[eintLocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
}
wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
fr->cginfo, x.unpaddedArrayRef(),
- nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
}
if (bNS)
{
- nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
}
- nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// higher-level object than the nb module.
fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
top->idef,
- nbnxn_gpu_get_xq(nbv->gpu_nbv),
- nbnxn_gpu_get_f(nbv->gpu_nbv),
- nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+ Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+ Nbnxm::gpu_get_f(nbv->gpu_nbv),
+ Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
}
/* do local pair search */
if (bNS)
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintLocal].nbl_lists,
- eintLocal,
- nbv->grp[eintLocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::Local,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nrnb);
- nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
if (bUseGPU)
{
/* initialize local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintLocal].nbl_lists.nbl[0],
- eintLocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::Local);
}
wallcycle_stop(wcycle, ewcNS);
}
else
{
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
wallcycle_start(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// bonded work not split into separate local and non-local, so with DD
}
/* launch local nonbonded work on GPU */
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
do non-local pair search */
if (DOMAINDECOMP(cr))
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
if (bNS)
{
wallcycle_start_nocount(wcycle, ewcNS);
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintNonlocal].nbl_lists,
- eintNonlocal,
- nbv->grp[eintNonlocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::NonLocal,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nrnb);
- nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+ if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
{
/* initialize non-local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintNonlocal].nbl_lists.nbl[0],
- eintNonlocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::NonLocal);
}
wallcycle_stop(wcycle, ewcNS);
}
{
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
wallcycle_start(wcycle, ewcLAUNCH_GPU);
/* launch non-local nonbonded tasks on GPU */
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork)
}
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (DOMAINDECOMP(cr))
{
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
}
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
if (!bUseOrEmulGPU)
{
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
step, nrnb, wcycle);
}
/* Calculate the local and non-local free energy interactions here.
* Happens here on the CPU both with and without GPU.
*/
- if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
}
if (DOMAINDECOMP(cr) &&
- fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
if (!bUseOrEmulGPU)
{
- int aloc;
-
if (DOMAINDECOMP(cr))
{
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
}
- if (!bUseOrEmulGPU)
- {
- aloc = eintLocal;
- }
- else
- {
- aloc = eintNonlocal;
- }
+ const Nbnxm::InteractionLocality iloc =
+ (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
/* Add all the non-bonded force to the normal force array.
* This can be split into a local and a non-local part when overlapping
*/
wallcycle_stop(wcycle, ewcFORCE);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
wallcycle_start_nocount(wcycle, ewcFORCE);
/* if there are multiple fshift output buffers reduce them */
if ((flags & GMX_FORCE_VIRIAL) &&
- nbv->grp[aloc].nbl_lists.nnbl > 1)
+ nbv->grp[iloc].nbl_lists.nnbl > 1)
{
/* This is not in a subcounter because it takes a
negligible and constant-sized amount of time */
if (bUseGPU)
{
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatNonlocal,
- ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::NonLocal,
+ ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
}
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
/* skip the reduction if there was no non-local work to do */
- if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+ if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
nbv->nbat, f, wcycle);
}
}
const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
/* Is dynamic pair-list pruning activated? */
if (nbv->listParams->useDynamicPruning)
* on the non-alternating path. */
if (bUseOrEmulGPU && !alternateGpuWait)
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, f, wcycle);
}
if (DOMAINDECOMP(cr))
}
bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
- bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+ bNS = ((flags & GMX_FORCE_NS) != 0);
/* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
bFillGrid = (bNS && bStateChanged);
bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
if (printReport)
{
- auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+ auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
if (pme_gpu_task_enabled(pme))
{
}
}
-extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
+void initialize_lambdas(FILE *fplog,
+ const t_inputrec &ir,
+ bool isMaster,
+ int *fep_state,
+ gmx::ArrayRef<real> lambda,
+ double *lam0)
{
- /* this function works, but could probably use a logic rewrite to keep all the different
- types of efep straight. */
+ /* TODO: Clean up initialization of fep_state and lambda in
+ t_state. This function works, but could probably use a logic
+ rewrite to keep all the different types of efep straight. */
- if ((ir->efep == efepNO) && (!ir->bSimTemp))
+ if ((ir.efep == efepNO) && (!ir.bSimTemp))
{
return;
}
- t_lambda *fep = ir->fepvals;
- *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
- if checkpoint is set -- a kludge is in for now
- to prevent this.*/
+ const t_lambda *fep = ir.fepvals;
+ if (isMaster)
+ {
+ *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
+ if checkpoint is set -- a kludge is in for now
+ to prevent this.*/
+ }
for (int i = 0; i < efptNR; i++)
{
+ double thisLambda;
/* overwrite lambda state with init_lambda for now for backwards compatibility */
- if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
+ if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
{
- lambda[i] = fep->init_lambda;
- if (lam0)
- {
- lam0[i] = lambda[i];
- }
+ thisLambda = fep->init_lambda;
}
else
{
- lambda[i] = fep->all_lambda[i][*fep_state];
- if (lam0)
- {
- lam0[i] = lambda[i];
- }
+ thisLambda = fep->all_lambda[i][fep->init_fep_state];
+ }
+ if (isMaster)
+ {
+ lambda[i] = thisLambda;
+ }
+ if (lam0 != nullptr)
+ {
+ lam0[i] = thisLambda;
}
}
- if (ir->bSimTemp)
+ if (ir.bSimTemp)
{
/* need to rescale control temperatures to match current state */
- for (int i = 0; i < ir->opts.ngtc; i++)
+ for (int i = 0; i < ir.opts.ngtc; i++)
{
- if (ir->opts.ref_t[i] > 0)
+ if (ir.opts.ref_t[i] > 0)
{
- ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
+ ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
}
}
}
fprintf(fplog, "]\n");
}
}
-
-
-void init_md(FILE *fplog,
- const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
- t_inputrec *ir, const gmx_output_env_t *oenv,
- const MdrunOptions &mdrunOptions,
- double *t, double *t0,
- t_state *globalState, double *lam0,
- t_nrnb *nrnb, gmx_mtop_t *mtop,
- gmx_update_t **upd,
- gmx::BoxDeformation *deform,
- int nfile, const t_filenm fnm[],
- gmx_mdoutf_t *outf, t_mdebin **mdebin,
- tensor force_vir, tensor shake_vir,
- tensor total_vir, tensor pres, rvec mu_tot,
- gmx_bool *bSimAnn, t_vcm **vcm,
- gmx_wallcycle_t wcycle)
-{
- int i;
-
- /* Initial values */
- *t = *t0 = ir->init_t;
-
- *bSimAnn = FALSE;
- for (i = 0; i < ir->opts.ngtc; i++)
- {
- /* set bSimAnn if any group is being annealed */
- if (ir->opts.annealing[i] != eannNO)
- {
- *bSimAnn = TRUE;
- }
- }
-
- /* Initialize lambda variables */
- /* TODO: Clean up initialization of fep_state and lambda in t_state.
- * We currently need to call initialize_lambdas on non-master ranks
- * to initialize lam0.
- */
- if (MASTER(cr))
- {
- initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
- }
- else
- {
- int tmpFepState;
- std::array<real, efptNR> tmpLambda;
- initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
- }
-
- // TODO upd is never NULL in practice, but the analysers don't know that
- if (upd)
- {
- *upd = init_update(ir, deform);
- }
- if (*bSimAnn)
- {
- update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
- }
-
- if (vcm != nullptr)
- {
- *vcm = init_vcm(fplog, &mtop->groups, ir);
- }
-
- if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
- {
- if (ir->etc == etcBERENDSEN)
- {
- please_cite(fplog, "Berendsen84a");
- }
- if (ir->etc == etcVRESCALE)
- {
- please_cite(fplog, "Bussi2007a");
- }
- if (ir->eI == eiSD1)
- {
- please_cite(fplog, "Goga2012");
- }
- }
- init_nrnb(nrnb);
-
- if (nfile != -1)
- {
- *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
-
- *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
- mtop, ir, mdoutf_get_fp_dhdl(*outf));
- }
-
- /* Initiate variables */
- clear_mat(force_vir);
- clear_mat(shake_vir);
- clear_rvec(mu_tot);
- clear_mat(total_vir);
- clear_mat(pres);
-}
-
-void init_rerun(FILE *fplog,
- const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
- t_inputrec *ir, const gmx_output_env_t *oenv,
- const MdrunOptions &mdrunOptions,
- t_state *globalState, double *lam0,
- t_nrnb *nrnb, gmx_mtop_t *mtop,
- int nfile, const t_filenm fnm[],
- gmx_mdoutf_t *outf, t_mdebin **mdebin,
- gmx_wallcycle_t wcycle)
-{
- /* Initialize lambda variables */
- /* TODO: Clean up initialization of fep_state and lambda in t_state.
- * We currently need to call initialize_lambdas on non-master ranks
- * to initialize lam0.
- */
- if (MASTER(cr))
- {
- initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
- }
- else
- {
- int tmpFepState;
- std::array<real, efptNR> tmpLambda;
- initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
- }
-
- init_nrnb(nrnb);
-
- if (nfile != -1)
- {
- *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
- *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
- mtop, ir, mdoutf_get_fp_dhdl(*outf), true);
- }
-}
#include "gromacs/awh/awh.h"
#include "gromacs/commandline/filenm.h"
-#include "gromacs/compat/make_unique.h"
#include "gromacs/domdec/collect.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
-#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/ewald/pme_load_balancing.h"
#include "gromacs/fileio/trxio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
-#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/listed_forces/manage_threading.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/compute_io.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/ebin.h"
+#include "gromacs/mdlib/energyoutput.h"
#include "gromacs/mdlib/expanded.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/md_support.h"
#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdebin.h"
#include "gromacs/mdlib/mdoutf.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/mdsetup.h"
#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/ns.h"
#include "gromacs/mdlib/resethandler.h"
#include "gromacs/mdlib/shellfc.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/vcm.h"
#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdtypes/awh-history.h"
-#include "gromacs/mdtypes/awh-params.h"
+#include "gromacs/mdtypes/awh_history.h"
+#include "gromacs/mdtypes/awh_params.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/df_history.h"
#include "gromacs/mdtypes/energyhistory.h"
// t_inputrec is being replaced by IMdpOptionsProvider, so this
// will go away eventually.
t_inputrec *ir = inputrec;
- gmx_mdoutf *outf = nullptr;
int64_t step, step_rel;
double t, lam0[efptNR];
bool isLastStep = false;
t_trxstatus *status;
rvec mu_tot;
t_trxframe rerun_fr;
- gmx_localtop_t *top;
- t_mdebin *mdebin = nullptr;
+ gmx_localtop_t top;
gmx_enerdata_t *enerd;
PaddedVector<gmx::RVec> f {};
gmx_global_stat_t gstat;
t_graph *graph = nullptr;
gmx_groups_t *groups;
- gmx_ekindata_t *ekind;
gmx_shellfc_t *shellfc;
double cycles;
"be available in a different form in a future version of GROMACS, "
"e.g. gmx rerun -f.");
+ if (ir->efep != efepNO && (mdAtoms->mdatoms()->nMassPerturbed > 0 ||
+ (constr && constr->havePerturbedConstraints())))
+ {
+ gmx_fatal(FARGS, "Perturbed masses or constraints are not supported by rerun. "
+ "Either make a .tpr without mass and constraint perturbation, "
+ "or use GROMACS 2018.4, 2018.5 or later 2018 version.");
+ }
if (ir->bExpanded)
{
gmx_fatal(FARGS, "Expanded ensemble not supported by rerun.");
top_global->intermolecularExclusionGroup = genQmmmIndices(*top_global);
}
- /* Initial values */
- init_rerun(fplog, cr, outputProvider, ir, oenv, mdrunOptions,
- state_global, lam0, nrnb, top_global,
- nfile, fnm, &outf, &mdebin, wcycle);
+ initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
+ init_nrnb(nrnb);
+ gmx_mdoutf *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, oenv, wcycle);
+ gmx::EnergyOutput energyOutput;
+ energyOutput.prepare(mdoutf_get_fp_ene(outf), top_global, ir, mdoutf_get_fp_dhdl(outf), true);
/* Energy terms and groups */
snew(enerd, 1);
enerd);
/* Kinetic energy data */
- snew(ekind, 1);
+ std::unique_ptr<gmx_ekindata_t> eKinData = std::make_unique<gmx_ekindata_t>();
+ gmx_ekindata_t *ekind = eKinData.get();
init_ekindata(fplog, top_global, &(ir->opts), ekind);
/* Copy the cos acceleration to the groups struct */
ekind->cosacc.cos_accel = ir->cos_accel;
ir->nstcalcenergy, DOMAINDECOMP(cr));
{
- double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+ double io = compute_io(ir, top_global->natoms, groups, energyOutput.numEnergyTerms(), 1);
if ((io > 2000) && MASTER(cr))
{
fprintf(stderr,
if (DOMAINDECOMP(cr))
{
- top = dd_init_local_top(top_global);
+ dd_init_local_top(*top_global, &top);
- stateInstance = compat::make_unique<t_state>();
+ stateInstance = std::make_unique<t_state>();
state = stateInstance.get();
dd_init_local_state(cr->dd, state_global, state);
/* Distribute the charge groups over the nodes from the master node */
dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
- state_global, top_global, ir,
- state, &f, mdAtoms, top, fr,
+ state_global, *top_global, ir,
+ state, &f, mdAtoms, &top, fr,
vsite, constr,
nrnb, nullptr, FALSE);
shouldCheckNumberOfBondedInteractions = true;
/* Copy the pointer to the global state */
state = state_global;
- snew(top, 1);
- mdAlgorithmsSetupAtomData(cr, ir, top_global, top, fr,
+ mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr,
&graph, mdAtoms, constr, vsite, shellfc);
}
&totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags);
}
checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
- top_global, top, state,
+ top_global, &top, state,
&shouldCheckNumberOfBondedInteractions);
if (MASTER(cr))
gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, "
"use a single rank");
}
- prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top->idef, ir->delta_t, *fr, graph);
+ prepareRerunState(rerun_fr, state_global, constructVsites, vsite, top.idef, ir->delta_t, *fr, graph);
}
isLastStep = isLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
const bool bMasterState = true;
dd_partition_system(fplog, mdlog, step, cr,
bMasterState, nstglobalcomm,
- state_global, top_global, ir,
- state, &f, mdAtoms, top, fr,
+ state_global, *top_global, ir,
+ state, &f, mdAtoms, &top, fr,
vsite, constr,
nrnb, wcycle,
mdrunOptions.verbose);
/* Now is the time to relax the shells */
relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
enforcedRotation, step,
- ir, bNS, force_flags, top,
+ ir, bNS, force_flags, &top,
constr, enerd, fcd,
state, f.arrayRefWithPadding(), force_vir, mdatoms,
nrnb, wcycle, graph, groups,
Awh *awh = nullptr;
gmx_edsam *ed = nullptr;
do_force(fplog, cr, ms, ir, awh, enforcedRotation,
- step, nrnb, wcycle, top, groups,
+ step, nrnb, wcycle, &top, groups,
state->box, state->x.arrayRefWithPadding(), &state->hist,
f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
state->lambda, graph,
do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
ir, state, state_global, observablesHistory,
top_global, fr,
- outf, mdebin, ekind, f,
+ outf, energyOutput, ekind, f,
isCheckpointingStep, doRerun, isLastStep,
mdrunOptions.writeConfout,
bSumEkinhOld);
shift_self(graph, state->box, as_rvec_array(state->x.data()));
}
construct_vsites(vsite, as_rvec_array(state->x.data()), ir->delta_t, as_rvec_array(state->v.data()),
- top->idef.iparams, top->idef.il,
+ top.idef.iparams, top.idef.il,
fr->ePBC, fr->bMolPBC, cr, state->box);
if (graph != nullptr)
| (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
);
checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
- top_global, top, state,
+ top_global, &top, state,
&shouldCheckNumberOfBondedInteractions);
}
if (MASTER(cr))
{
const bool bCalcEnerStep = true;
- upd_mdebin(mdebin, doFreeEnergyPerturbation, bCalcEnerStep,
- t, mdatoms->tmass, enerd, state,
- ir->fepvals, ir->expandedvals, rerun_fr.box,
- shake_vir, force_vir, total_vir, pres,
- ekind, mu_tot, constr);
+ energyOutput.addDataAtEnergyStep(doFreeEnergyPerturbation, bCalcEnerStep,
+ t, mdatoms->tmass, enerd, state,
+ ir->fepvals, ir->expandedvals, state->box,
+ shake_vir, force_vir, total_vir, pres,
+ ekind, mu_tot, constr);
const bool do_ene = true;
const bool do_log = true;
const bool do_dr = ir->nstdisreout != 0;
const bool do_or = ir->nstorireout != 0;
- print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : nullptr,
- step, t,
- eprNORMAL, mdebin, fcd, groups, &(ir->opts), awh);
+ energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
+ do_log ? fplog : nullptr,
+ step, t,
+ eprNORMAL, fcd, groups, &(ir->opts), awh);
if (do_per_step(step, ir->nstlog))
{
gmx_pme_send_finish(cr);
}
- done_mdebin(mdebin);
done_mdoutf(outf);
done_shellfc(fplog, shellfc, step_rel);
destroy_enerdata(enerd);
sfree(enerd);
- sfree(top);
}
#include <cstring>
#include <algorithm>
+#include <memory>
#include "gromacs/commandline/filenm.h"
-#include "gromacs/compat/make_unique.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/localatomsetmanager.h"
-#include "gromacs/ewald/ewald-utils.h"
+#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
-#include "gromacs/ewald/pme-gpu-program.h"
+#include "gromacs/ewald/pme_gpu_program.h"
#include "gromacs/fileio/checkpoint.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/oenv.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/printhardware.h"
-#include "gromacs/listed-forces/disre.h"
-#include "gromacs/listed-forces/gpubonded.h"
-#include "gromacs/listed-forces/orires.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
-#include "gromacs/mdlib/nbnxn_search.h"
-#include "gromacs/mdlib/nbnxn_tuning.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/sighandler.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/observableshistory.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/pairlist_tuning.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/output.h"
#include "gromacs/pulling/pull.h"
// part of the interface to the client code, which is associated only with the
// original thread. Handles to the same resources can be obtained by copy.
{
- newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_);
+ newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
}
// Copy original cr pointer before master thread can pass the thread barrier
newRunner.replExParams = replExParams;
newRunner.pforce = pforce;
newRunner.ms = ms;
- newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+ newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
threadMpiMdrunnerAccessBarrier();
int nChargePerturbed = -1, nTypePerturbed = 0;
gmx_wallcycle_t wcycle;
gmx_walltime_accounting_t walltime_accounting = nullptr;
- int rc;
- int64_t reset_counters;
- int nthreads_pme = 1;
- gmx_membed_t * membed = nullptr;
- gmx_hw_info_t *hwinfo = nullptr;
+ gmx_membed_t * membed = nullptr;
+ gmx_hw_info_t *hwinfo = nullptr;
/* CAUTION: threads may be started later on in this function, so
cr doesn't reflect the final parallel state right now */
std::vector<int> gpuIdsAvailable;
try
{
- gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable);
- // TODO We could put the GPU IDs into a std::map to find
- // duplicates, but for the small numbers of IDs involved, this
- // code is simple and fast.
- for (size_t i = 0; i != gpuIdsAvailable.size(); ++i)
- {
- for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j)
- {
- if (gpuIdsAvailable[i] == gpuIdsAvailable[j])
- {
- GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str())));
- }
- }
- }
+ gpuIdsAvailable = parseUserGpuIdString(hw_opt.gpuIdsAvailable);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
std::vector<int> userGpuTaskAssignment;
try
{
- userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment);
+ userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
auto nonbondedTarget = findTaskTarget(nbpu_opt);
if (SIMMASTER(cr))
{
/* Only the master rank has the global state */
- globalState = compat::make_unique<t_state>();
+ globalState = std::make_unique<t_state>();
/* Read (nearly) all data required for the simulation */
read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop);
inputrec->cutoff_scheme == ecutsVERLET,
gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI),
hw_opt.nthreads_tmpi);
- auto canUseGpuForPme = pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, mtop, nullptr);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
(useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
- canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+ *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
usingVerletScheme,
gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI),
gpusWereDetected);
- auto canUseGpuForPme = pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, mtop, nullptr);
useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
- canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks,
+ *hwinfo, *inputrec, mtop,
+ cr->nnodes, domdecOptions.numPmeRanks,
gpusWereDetected);
auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
useGpuForBonded =
{
if (!MASTER(cr))
{
- globalState = compat::make_unique<t_state>();
+ globalState = std::make_unique<t_state>();
}
broadcastStateWithoutDynamics(cr, globalState.get());
}
gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
}
- if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
- {
- gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
- }
-
if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
{
if (domdecOptions.numPmeRanks > 0)
}
}
- /* getting number of PP/PME threads
+ /* getting number of PP/PME threads on this MPI / tMPI rank.
PME: env variable should be read only on one node to make sure it is
identical everywhere;
*/
- nthreads_pme = gmx_omp_nthreads_get(emntPME);
-
- int numThreadsOnThisRank;
- /* threads on this MPI process or TMPI thread */
- if (thisRankHasDuty(cr, DUTY_PP))
- {
- numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
- }
- else
- {
- numThreadsOnThisRank = nthreads_pme;
- }
-
+ const int numThreadsOnThisRank =
+ thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
*hwinfo->hardwareTopology,
physicalNodeComm, mdlog);
{
/* Master synchronizes its value of reset_counters with all nodes
* including PME only nodes */
- reset_counters = wcycle_get_reset_counters(wcycle);
+ int64_t reset_counters = wcycle_get_reset_counters(wcycle);
gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
wcycle_set_reset_counters(wcycle, reset_counters);
}
mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0,
mdrunOptions.reproducible,
ewaldcoeff_q, ewaldcoeff_lj,
- nthreads_pme,
+ gmx_omp_nthreads_get(emntPME),
pmeRunMode, nullptr,
pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
}
pmedata,
EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
+ // clean up cycle counter
+ wallcycle_destroy(wcycle);
+
// Free PME data
if (pmedata)
{
gmx_fedisableexcept();
}
- rc = static_cast<int>(gmx_get_stop_condition());
+ auto rc = static_cast<int>(gmx_get_stop_condition());
#if GMX_THREAD_MPI
/* we need to join all threads. The sub-threads join when they
wait for that. */
if (PAR(cr) && MASTER(cr))
{
- done_commrec(cr);
tMPI_Finalize();
}
+ //TODO free commrec in MPI simulations
+ done_commrec(cr);
#endif
-
return rc;
}
void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
{
- multisim_ = compat::make_unique<gmx_multisim_t*>(multisim);
+ multisim_ = std::make_unique<gmx_multisim_t*>(multisim);
}
Mdrunner Mdrunner::BuilderImplementation::build()
GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
}
- newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>();
+ newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
if (stopHandlerBuilder_)
{
}
else
{
- newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>();
+ newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
}
return newRunner;
}
MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) :
- impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)}
+ impl_ {std::make_unique<Mdrunner::BuilderImplementation>(context)}
{
}
--- /dev/null
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Utility constant and function declaration for the CUDA non-bonded kernels.
+ * This header should be included once at the top level, just before the
+ * kernels are included (has to be preceded by nbnxm_cuda_types.h).
+ *
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ * \ingroup module_nbnxm
+ */
+#include <assert.h>
+
+/* Note that floating-point constants in CUDA code should be suffixed
+ * with f (e.g. 0.5f), to stop the compiler producing intermediate
+ * code that is in double precision.
+ */
+
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+
+#include "nbnxm_cuda_types.h"
+
+#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
+#define NBNXM_CUDA_KERNEL_UTILS_CUH
+
+/*! \brief Log of the i and j cluster size.
+ * change this together with c_clSize !*/
+static const int __device__ c_clSizeLog2 = 3;
+/*! \brief Square of cluster size. */
+static const int __device__ c_clSizeSq = c_clSize*c_clSize;
+/*! \brief j-cluster size after split (4 in the current implementation). */
+static const int __device__ c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+/*! \brief Stride in the force accumulation buffer */
+static const int __device__ c_fbufStride = c_clSizeSq;
+/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
+static const unsigned __device__ superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+
+static const float __device__ c_oneSixth = 0.16666667f;
+static const float __device__ c_oneTwelveth = 0.08333333f;
+
+
+/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+static __forceinline__ __device__
+void convert_sigma_epsilon_to_c6_c12(const float sigma,
+ const float epsilon,
+ float *c6,
+ float *c12)
+{
+ float sigma2, sigma6;
+
+ sigma2 = sigma * sigma;
+ sigma6 = sigma2 *sigma2 * sigma2;
+ *c6 = epsilon * sigma6;
+ *c12 = *c6 * sigma6;
+}
+
+/*! Apply force switch, force-only version. */
+static __forceinline__ __device__
+void calculate_force_switch_F(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+static __forceinline__ __device__
+void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+static __forceinline__ __device__
+void calculate_potential_switch_F(const cu_nbparam_t nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version. */
+static __forceinline__ __device__
+void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
+ int typei,
+ int typej)
+{
+#if DISABLE_CUDA_TEXTURES
+ return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
+#else
+ return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+#endif /* DISABLE_CUDA_TEXTURES */
+}
+
+
+/*! Calculate LJ-PME grid force contribution with
+ * geometric combination rule.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Fetch per-type LJ parameters.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
+ int type)
+{
+ float2 c6c12;
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
+ c6c12 = LDG(&nbfp_comb[type]);
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+ c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
+ c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
+#endif /* DISABLE_CUDA_TEXTURES */
+
+ return c6c12;
+}
+
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
+ * Lorentz-Berthelot combination rule.
+ * We use a single F+E kernel with conditional because the performance impact
+ * of this is pretty small and LB on the CPU is anyway very slow.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ float sigma, sigma2, epsilon;
+
+ /* sigma and epsilon are scaled to give 6*C6 */
+ float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
+ float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
+
+ sigma = c6c12_i.x + c6c12_j.x;
+ epsilon = c6c12_i.y * c6c12_j.y;
+
+ sigma2 = sigma*sigma;
+ c6grid = epsilon*sigma2*sigma2*sigma2;
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ if (E_lj != nullptr)
+ {
+ float sh_mask;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ }
+}
+
+
+/*! Fetch two consecutive values from the Ewald correction F*r table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
+ int index)
+{
+ float2 d;
+
+#if DISABLE_CUDA_TEXTURES
+ /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
+ d.x = LDG(&nbparam.coulomb_tab[index]);
+ d.y = LDG(&nbparam.coulomb_tab[index + 1]);
+#else
+ d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+ d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+#endif // DISABLE_CUDA_TEXTURES
+
+ return d;
+}
+
+/*! Linear interpolation using exactly two FMA operations.
+ *
+ * Implements numeric equivalent of: (1-t)*d0 + t*d1
+ * Note that CUDA does not have fnms, otherwise we'd use
+ *  fma(t, d1, fnms(t, d0, d0))
+ * but input modifiers are designed for this and are fast.
+ */
+template <typename T>
+__forceinline__ __host__ __device__
+T lerp(T d0, T d1, T t)
+{
+ return fma(t, d1, fma(-t, d0, d0));
+}
+
+/*! Interpolate Ewald coulomb force correction using the F*r table.
+ */
+static __forceinline__ __device__
+float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
+ float r)
+{
+ float normalized = nbparam.coulomb_tab_scale * r;
+ int index = (int) normalized;
+ float fraction = normalized - index;
+
+ float2 d01 = fetch_coulomb_force_r(nbparam, index);
+
+ return lerp(d01.x, d01.y, fraction);
+}
+
+/*! Fetch C6 and C12 from the parameter table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+void fetch_nbfp_c6_c12(float &c6,
+ float &c12,
+ const cu_nbparam_t nbparam,
+ int baseIndex)
+{
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp = (float2 *)nbparam.nbfp;
+ float2 c6c12;
+ c6c12 = LDG(&nbfp[baseIndex]);
+ c6 = c6c12.x;
+ c12 = c6c12.y;
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+ c6 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
+ c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
+#endif // DISABLE_CUDA_TEXTURES
+}
+
+
+/*! Calculate analytical Ewald correction term. */
+static __forceinline__ __device__
+float pmecorrF(float z2)
+{
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = z2*z2;
+
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ polyFD0 = 1.0f/polyFD0;
+
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
+/*! Final j-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_j_generic(float *f_buf, float3 *fout,
+ int tidxi, int tidxj, int aidx)
+{
+ if (tidxi < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+ {
+ f += f_buf[c_fbufStride * tidxi + j];
+ }
+
+ atomicAdd((&fout[aidx].x)+tidxi, f);
+ }
+}
+
+/*! Final j-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_j_warp_shfl(float3 f, float3 *fout,
+ int tidxi, int aidx,
+ const unsigned int activemask)
+{
+ f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+ f.y += gmx_shfl_up_sync (activemask, f.y, 1);
+ f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+
+ if (tidxi & 1)
+ {
+ f.x = f.y;
+ }
+
+ f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+ f.z += gmx_shfl_up_sync (activemask, f.z, 2);
+
+ if (tidxi & 2)
+ {
+ f.x = f.z;
+ }
+
+ f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+
+ if (tidxi < 3)
+ {
+ atomicAdd((&fout[aidx].x) + tidxi, f.x);
+ }
+}
+
+/*! Final i-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ * TODO: add the tidxi < 3 trick
+ */
+static __forceinline__ __device__
+void reduce_force_i_generic(float *f_buf, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ if (tidxj < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
+ {
+ f += f_buf[tidxj * c_fbufStride + j];
+ }
+
+ atomicAdd(&fout[aidx].x + tidxj, f);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += f;
+ }
+ }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ int i, j;
+ float f;
+
+ assert(c_clSize == 1 << c_clSizeLog2);
+
+ /* Reduce the initial c_clSize values for each i atom to half
+ * every step by using c_clSize * i threads.
+ * Can't just use i as loop variable because then nvcc refuses to unroll.
+ */
+ i = c_clSize/2;
+#pragma unroll 5
+ for (j = c_clSizeLog2 - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * c_clSize + tidxi] += f_buf[ (tidxj + i) * c_clSize + tidxi];
+ f_buf[ c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[ c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ }
+ i >>= 1;
+ }
+
+ /* i == 1, last reduction step, writing to global mem */
+ if (tidxj < 3)
+ {
+ /* tidxj*c_fbufStride selects x, y or z */
+ f = f_buf[tidxj * c_fbufStride + tidxi] +
+ f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
+
+ atomicAdd(&(fout[aidx].x) + tidxj, f);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += f;
+ }
+ }
+
+}
+
+/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ * on whether the size of the array to be reduced is power of two or not.
+ */
+static __forceinline__ __device__
+void reduce_force_i(float *f_buf, float3 *f,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int ai)
+{
+ if ((c_clSize & (c_clSize - 1)))
+ {
+ reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+ else
+ {
+ reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxj, int aidx,
+ const unsigned int activemask)
+{
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+ fin.y += gmx_shfl_up_sync (activemask, fin.y, c_clSize);
+ fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+
+ if (tidxj & 1)
+ {
+ fin.x = fin.y;
+ }
+
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+ fin.z += gmx_shfl_up_sync (activemask, fin.z, 2*c_clSize);
+
+ if (tidxj & 2)
+ {
+ fin.x = fin.z;
+ }
+
+ /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
+ if ((tidxj & 3) < 3)
+ {
+ atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += fin.x;
+ }
+ }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_energy_pow2(volatile float *buf,
+ float *e_lj, float *e_el,
+ unsigned int tidx)
+{
+ float e1, e2;
+
+ unsigned int i = warp_size/2;
+
+    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
+#pragma unroll 10
+ for (int j = warp_size_log2 - 1; j > 0; j--)
+ {
+ if (tidx < i)
+ {
+ buf[ tidx] += buf[ tidx + i];
+ buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
+ }
+ i >>= 1;
+ }
+
+    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
+ // memory locations to make sense
+ /* last reduction step, writing to global mem */
+ if (tidx == 0)
+ {
+ e1 = buf[ tidx] + buf[ tidx + i];
+ e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
+
+ atomicAdd(e_lj, e1);
+ atomicAdd(e_el, e2);
+ }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_energy_warp_shfl(float E_lj, float E_el,
+ float *e_lj, float *e_el,
+ int tidx,
+ const unsigned int activemask)
+{
+ int i, sh;
+
+ sh = 1;
+#pragma unroll 5
+ for (i = 0; i < 5; i++)
+ {
+ E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+ E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+ sh += sh;
+ }
+
+ /* The first thread in the warp writes the reduced energies */
+ if (tidx == 0 || tidx == warp_size)
+ {
+ atomicAdd(e_lj, E_lj);
+ atomicAdd(e_el, E_el);
+ }
+}
+
+#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
--- /dev/null
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#define GMX_DOUBLE 0
+
+#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "gromacs/nbnxm/constants.h"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "nbnxm_ocl_consts.h"
+
+#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster
+
+#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
+
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
+ * Note that this should precede the kernel_utils include.
+ */
+#define USE_CJ_PREFETCH 1
+#else
+#define USE_CJ_PREFETCH 0
+#endif
+
+#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
+#define HAVE_SUBGROUP 1
+#else
+#define HAVE_SUBGROUP 0
+#endif
+
+#ifdef cl_intel_subgroups
+#define HAVE_INTEL_SUBGROUP 1
+#else
+#define HAVE_INTEL_SUBGROUP 0
+#endif
+
+#if defined _INTEL_SOURCE_
+#define SUBGROUP_SIZE 8
+#elif defined _AMD_SOURCE_
+#define SUBGROUP_SIZE 64
+#else
+#define SUBGROUP_SIZE 32
+#endif
+
+#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
+
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+//-------------------
+
+#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+
+#if CL_SIZE == 8
+#define WARP_SIZE_LOG2 (5)
+#define CL_SIZE_LOG2 (3)
+#elif CL_SIZE == 4
+#define WARP_SIZE_LOG2 (3)
+#define CL_SIZE_LOG2 (2)
+#else
+#error unsupported CL_SIZE
+#endif
+
+#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE (CL_SIZE_SQ)
+
+#define ONE_SIXTH_F 0.16666667f
+#define ONE_TWELVETH_F 0.08333333f
+
+
+#ifdef __GNUC__
+/* GCC, clang, and some ICC pretending to be GCC */
+# define gmx_unused __attribute__ ((unused))
+#else
+# define gmx_unused
+#endif
+
+// Data structures shared between OpenCL device code and OpenCL host code
+// TODO: review, improve
+// Replaced real by float for now, to avoid including any other header
+typedef struct {
+ float c2;
+ float c3;
+ float cpot;
+} shift_consts_t;
+
+/* Used with potential switching:
+ * rsw = max(r - r_switch, 0)
+ * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * force      = force*sw - potential*dsw
+ * potential *= sw
+ */
+typedef struct {
+ float c3;
+ float c4;
+ float c5;
+} switch_consts_t;
+
+// Data structure shared between the OpenCL device code and OpenCL host code
+// Must not contain OpenCL objects (buffers)
+typedef struct cl_nbparam_params
+{
+
+ int eeltype; /**< type of electrostatics, takes values from #eelCu */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwCu */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlistOuter_sq; /**< Full, outer pair-list cut-off squared */
+ float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
+
+ /* Ewald Coulomb force table data - accessed through texture memory */
+ float coulomb_tab_scale; /**< table scale/spacing */
+}cl_nbparam_params_t;
+
+typedef struct {
+ int sci; /* i-super-cluster */
+ int shift; /* Shift vector index plus possible flags */
+ int cj4_ind_start; /* Start index into cj4 */
+ int cj4_ind_end; /* End index into cj4 */
+} nbnxn_sci_t;
+
+typedef struct {
+ unsigned int imask; /* The i-cluster interactions mask for 1 warp */
+ int excl_ind; /* Index into the exclusion array for 1 warp */
+} nbnxn_im_ei_t;
+
+typedef struct {
+ int cj[4]; /* The 4 j-clusters */
+ nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
+} nbnxn_cj4_t;
+
+
+typedef struct {
+ unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+                                           * each unsigned has bits for 4*8 i clusters
+ */
+} nbnxn_excl_t;
+
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
+gmx_opencl_inline
+void preloadCj4Generic(__local int *sm_cjPreload,
+ const __global int *gm_cj,
+ int tidxi,
+ int tidxj,
+ bool gmx_unused iMaskCond)
+{
+ /* Pre-load cj into shared memory */
+#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+ if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
+ {
+ sm_cjPreload[tidxi] = gm_cj[tidxi];
+ }
+#else
+ const int c_clSize = CL_SIZE;
+ const int c_nbnxnGpuClusterpairSplit = 2;
+ const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
+ {
+ sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
+ }
+#endif
+}
+
+
+#if USE_SUBGROUP_PRELOAD
+gmx_opencl_inline
+int preloadCj4Subgroup(const __global int *gm_cj)
+{
+ //loads subgroup-size # of elements (8) instead of the 4 required
+ //equivalent to *cjs = *gm_cj
+ return intel_sub_group_block_read((const __global uint *)gm_cj);
+}
+#endif //USE_SUBGROUP_PRELOAD
+
+#if USE_SUBGROUP_PRELOAD
+typedef size_t CjType;
+#else
+typedef __local int* CjType;
+#endif
+
+/*! \brief Preload cj4
+ *
+ * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
+ * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
+ * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
+ *
+ * It is the caller's responsibility to make sure that data is consumed only when
+ * it's ready. This function does not call a barrier.
+ */
+gmx_opencl_inline
+void preloadCj4(CjType gmx_unused *cjs,
+ const __global int gmx_unused *gm_cj,
+ int gmx_unused tidxi,
+ int gmx_unused tidxj,
+ bool gmx_unused iMaskCond)
+{
+#if USE_SUBGROUP_PRELOAD
+ *cjs = preloadCj4Subgroup(gm_cj);
+#elif USE_CJ_PREFETCH
+ preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
+#else
+ //nothing to do
+#endif
+}
+
+gmx_opencl_inline
+int loadCjPreload(__local int * sm_cjPreload,
+ int jm,
+ int gmx_unused tidxi,
+ int gmx_unused tidxj)
+{
+#if defined _AMD_SOURCE_
+ int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+#else
+ const int c_clSize = CL_SIZE;
+ const int c_nbnxnGpuClusterpairSplit = 2;
+ const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
+#endif
+ return sm_cjPreload[jm + warpLoadOffset];
+}
+
+/* \brief Load a cj given a jm index.
+ *
+ * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
+ */
+gmx_opencl_inline
+int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
+ int jm, int gmx_unused tidxi, int gmx_unused tidxj)
+{
+#if USE_SUBGROUP_PRELOAD
+ return sub_group_broadcast(cjs, jm);
+#elif USE_CJ_PREFETCH
+ return loadCjPreload(cjs, jm, tidxi, tidxj);
+#else
+ return gm_cj[jm];
+#endif
+}
+
+/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+gmx_opencl_inline
+void convert_sigma_epsilon_to_c6_c12(const float sigma,
+ const float epsilon,
+ float *c6,
+ float *c12)
+{
+ float sigma2, sigma6;
+
+ sigma2 = sigma * sigma;
+ sigma6 = sigma2 *sigma2 * sigma2;
+ *c6 = epsilon * sigma6;
+ *c12 = *c6 * sigma6;
+}
+
+
+/*! Apply force switch, force-only version. */
+gmx_opencl_inline
+void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+gmx_opencl_inline
+void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+gmx_opencl_inline
+void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ const float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = nbparam->vdw_switch.c3;
+ float switch_F3 = nbparam->vdw_switch.c4;
+ float switch_F4 = nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version. */
+gmx_opencl_inline
+void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = nbparam->vdw_switch.c3;
+ float switch_F3 = nbparam->vdw_switch.c4;
+ float switch_F4 = nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+/*! Calculate LJ-PME grid force contribution with
+ * geometric combination rule.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+ c6grid = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_geom_F_E(__constant const float *nbfp_comb_climg2d,
+ const cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+ c6grid = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam->sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ * Lorentz-Berthelot combination rule.
+ * We use a single F+E kernel with conditional because the performance impact
+ * of this is pretty small and LB on the CPU is anyway very slow.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_LB_F_E(__constant const float *nbfp_comb_climg2d,
+ const cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ bool with_E_lj,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ float sigma, sigma2, epsilon;
+
+ /* sigma and epsilon are scaled to give 6*C6 */
+ sigma = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+
+ epsilon = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+
+ sigma2 = sigma*sigma;
+ c6grid = epsilon*sigma2*sigma2*sigma2;
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ if (with_E_lj)
+ {
+ float sh_mask;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam->sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ }
+}
+
+/*! Interpolate Ewald coulomb force using the coulomb_tab force table.
+ * Original idea: from the OpenMM project
+ */
+gmx_opencl_inline float
+interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
+ float r,
+ float scale)
+{
+ float normalized = scale * r;
+ int index = (int) normalized;
+ float fract2 = normalized - index;
+ float fract1 = 1.0f - fract2;
+
+ return fract1*coulomb_tab_climg2d[index] +
+ fract2*coulomb_tab_climg2d[index + 1];
+}
+
+/*! Calculate analytical Ewald correction term. */
+gmx_opencl_inline
+float pmecorrF(float z2)
+{
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = z2*z2;
+
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ polyFD0 = 1.0f/polyFD0;
+
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_j_shfl(float3 fin, __global float *fout,
+ int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
+{
+    /* Only does the reduction over 4 elements in the cluster. Needs to be
+     * changed for CL_SIZE>4. See the CUDA code for the required changes. */
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
+ if ((tidxi & 1) == 1)
+ {
+ fin.x = fin.y;
+ }
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
+ fin.z += intel_sub_group_shuffle_up (fin.z, fin.z, 2);
+ if (tidxi == 2)
+ {
+ fin.x = fin.z;
+ }
+ if (tidxi < 3)
+ {
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
+ }
+}
+#endif
+
+gmx_opencl_inline
+void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+ int tidx = tidxi + tidxj*CL_SIZE;
+ f_buf[ tidx] = fcj_buf.x;
+ f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+ /* Split the reduction between the first 3 column threads
+ Threads with column id 0 will do the reduction for (float3).x components
+ Threads with column id 1 will do the reduction for (float3).y components
+ Threads with column id 2 will do the reduction for (float3).z components.
+ The reduction is performed for each line tidxj of f_buf. */
+ if (tidxi < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+ {
+ f += f_buf[FBUF_STRIDE * tidxi + j];
+ }
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+ }
+}
+
+/*! Final j-force reduction
+ */
+gmx_opencl_inline
+void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
+#else
+ reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
+#endif
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
+ bool bCalcFshift, int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
+{
+    /* Only does the reduction over 4 elements in the cluster (2 per warp).
+     * Needs to be changed for CL_SIZE>4. */
+ float2 fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ float3 fin = fci_buf[ci_offset];
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, CL_SIZE);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
+
+ if (tidxj & 1)
+ {
+ fin.x = fin.y;
+ }
+ /* Threads 0,1 and 2,3 increment x,y for their warp */
+ atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
+ if (bCalcFshift)
+ {
+ fshift_buf[0] += fin.x;
+ }
+ /* Threads 0 and 2 increment z for their warp */
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&fout[3*aidx+2], fin.z);
+ if (bCalcFshift)
+ {
+ fshift_buf[1] += fin.z;
+ }
+ }
+ }
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
+ {
+ //Threads 0,1 and 2,3 update x,y
+ atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
+ //Threads 0 and 2 update z
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
+ }
+ }
+}
+#endif
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+gmx_opencl_inline
+void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
+ __global float *fout,
+ bool bCalcFshift,
+ int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
+{
+ float fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int tidx = tidxi + tidxj*CL_SIZE;
+ /* store i forces in shmem */
+ f_buf[ tidx] = fci_buf[ci_offset].x;
+ f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int i, j;
+ /* Reduce the initial CL_SIZE values for each i atom to half
+ * every step by using CL_SIZE * i threads.
+     * Can't just use i as the loop variable, because otherwise nvcc refuses to unroll.
+ */
+ i = CL_SIZE/2;
+ for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ }
+ i >>= 1;
+ }
+ /* needed because
+ * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
+ * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
+ * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
+ */
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* i == 1, last reduction step, writing to global mem */
+ /* Split the reduction between the first 3 line threads
+ Threads with line id 0 will do the reduction for (float3).x components
+ Threads with line id 1 will do the reduction for (float3).y components
+ Threads with line id 2 will do the reduction for (float3).z components. */
+ if (tidxj < 3)
+ {
+ float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+ if (bCalcFshift)
+ {
+ fshift_buf += f;
+ }
+ }
+ }
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
+ {
+ /* Only threads with tidxj < 3 will update fshift.
+ The threads performing the update, must be the same as the threads
+ storing the reduction result above.
+ */
+ if (tidxj < 3)
+ {
+ atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
+ }
+ }
+}
+
+/*! Final i-force reduction
+ */
+gmx_opencl_inline
+void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
+ bool bCalcFshift, int tidxi, int tidxj, int sci,
+ int shift, __global float *fshift)
+{
+#if REDUCE_SHUFFLE
+ reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#else
+ reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#endif
+}
+
+
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_energy_shfl(float E_lj, float E_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+ E_lj = sub_group_reduce_add(E_lj);
+ E_el = sub_group_reduce_add(E_el);
+ /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
+ * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
+ * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
+ * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
+ * (by supporting c_nbnxnGpuClusterpairSplit=1). */
+ if (tidx == 0 || tidx == WARP_SIZE)
+ {
+ atomicAdd_g_f(e_lj, E_lj);
+ atomicAdd_g_f(e_el, E_el);
+ }
+}
+#endif
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+gmx_opencl_inline
+void reduce_energy_pow2(volatile __local float *buf,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+ int j;
+
+ unsigned int i = WARP_SIZE/2;
+
+    /* Can't just use i as the loop variable, because otherwise nvcc refuses to unroll. */
+ for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
+ {
+ if (tidx < i)
+ {
+ buf[ tidx] += buf[ tidx + i];
+ buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+ }
+ i >>= 1;
+ }
+
+ /* last reduction step, writing to global mem */
+ if (tidx == 0)
+ {
+ float e1 = buf[ tidx] + buf[ tidx + i];
+ float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+ atomicAdd_g_f(e_lj, e1);
+ atomicAdd_g_f(e_el, e2);
+ }
+}
+
+gmx_opencl_inline
+void reduce_energy(volatile __local float gmx_unused *buf,
+ float E_lj, float E_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
+#else
+ /* flush the energies to shmem and reduce them */
+ buf[ tidx] = E_lj;
+ buf[FBUF_STRIDE + tidx] = E_el;
+ reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+#endif
+}
+
+gmx_opencl_inline
+bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
+{
+ if (pred)
+ {
+ warp_any[widx] = 1;
+ }
+
+ bool ret = warp_any[widx];
+
+ warp_any[widx] = 0;
+
+ return ret;
+}
+
+//! Returns true if the predicate is true for any work item in the warp
+gmx_opencl_inline
+bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
+{
+#if USE_SUBGROUP_ANY
+ return sub_group_any(pred);
+#else
+ return gmx_sub_group_any_localmem(warp_any, widx, pred);
+#endif
+}
+
+#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
#include <algorithm>
#include <string>
+ #include "gromacs/ewald/pme.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/taskassignment/taskassignment.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/baseversion.h"
const TaskTarget pmeTarget,
const std::vector<int> &gpuIdsToUse,
const std::vector<int> &userGpuTaskAssignment,
- const bool canUseGpuForPme,
+ const gmx_hw_info_t &hardwareInfo,
+ const t_inputrec &inputrec,
+ const gmx_mtop_t &mtop,
const int numRanksPerSimulation,
const int numPmeRanksPerSimulation)
{
// First, exclude all cases where we can't run PME on GPUs.
if ((pmeTarget == TaskTarget::Cpu) ||
!useGpuForNonbonded ||
- !canUseGpuForPme)
+ !pme_gpu_supports_build(nullptr) ||
+ !pme_gpu_supports_hardware(hardwareInfo, nullptr) ||
+ !pme_gpu_supports_input(inputrec, mtop, nullptr))
{
// PME can't run on a GPU. If the user required that, we issue
// an error later.
bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded,
const TaskTarget pmeTarget,
const std::vector<int> &userGpuTaskAssignment,
- const bool canUseGpuForPme,
+ const gmx_hw_info_t &hardwareInfo,
+ const t_inputrec &inputrec,
+ const gmx_mtop_t &mtop,
const int numRanksPerSimulation,
const int numPmeRanksPerSimulation,
const bool gpusWereDetected)
if (pmeTarget == TaskTarget::Gpu)
{
GMX_THROW(NotImplementedError
- ("The PME on the GPU is only supported when nonbonded interactions run on GPUs also."));
+ ("PME on GPUs is only supported when nonbonded interactions run on GPUs also."));
}
return false;
}
- if (!canUseGpuForPme)
+ std::string message;
+ if (!pme_gpu_supports_build(&message))
{
if (pmeTarget == TaskTarget::Gpu)
{
- // TODO Pass in the inputrec so we can give more help here?
GMX_THROW(NotImplementedError
- ("The input simulation did not use PME in a way that is supported on the GPU."));
+ ("Cannot compute PME interactions on a GPU, because " + message));
+ }
+ return false;
+ }
+ if (!pme_gpu_supports_hardware(hardwareInfo, &message))
+ {
+ if (pmeTarget == TaskTarget::Gpu)
+ {
+ GMX_THROW(NotImplementedError
+ ("Cannot compute PME interactions on a GPU, because " + message));
+ }
+ return false;
+ }
+ if (!pme_gpu_supports_input(inputrec, mtop, &message))
+ {
+ if (pmeTarget == TaskTarget::Gpu)
+ {
+ GMX_THROW(NotImplementedError
+ ("Cannot compute PME interactions on a GPU, because " + message));
}
return false;
}
// Specifying -gputasks requires specifying everything.
if (pmeTarget == TaskTarget::Auto)
{
- GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi")));
+ GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi"))); // TODO ntmpi?
}
return true;
if (pmeOnGpu)
{
GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) &&
- pme_gpu_supports_build(*hwinfo, nullptr) && pme_gpu_supports_input(*inputrec, *mtop, nullptr),
+ pme_gpu_supports_build(nullptr) &&
+ pme_gpu_supports_hardware(*hwinfo, nullptr) &&
+ pme_gpu_supports_input(*inputrec, *mtop, nullptr),
"PME can't be on GPUs unless we are using PME");
// PME on GPUs supports a single PME rank with PP running on the same or few other ranks.
/* nonbondedOnGpu might be false e.g. because this simulation uses
* the group scheme, or is a rerun with energy groups. */
- ngpu = (nonbondedOnGpu ? static_cast<int>(gpuIdsToUse.size()) : 0);
+ ngpu = (nonbondedOnGpu ? gmx::ssize(gpuIdsToUse) : 0);
if (inputrec->cutoff_scheme == ecutsGROUP)
{
#include "topologyinformation.h"
-#include "gromacs/compat/make_unique.h"
+#include <memory>
+
#include "gromacs/fileio/confio.h"
#include "gromacs/math/vec.h"
#include "gromacs/pbcutil/rmpbc.h"
{
TopologyInformation::TopologyInformation()
- : mtop_(std::make_unique<gmx_mtop_t>()),
- hasLoadedMtop_(false),
+ : hasLoadedMtop_(false),
expandedTopology_(nullptr),
atoms_ (nullptr),
bTop_(false), ePBC_(-1)
void TopologyInformation::fillFromInputFile(const std::string &filename)
{
- mtop_ = gmx::compat::make_unique<gmx_mtop_t>();
+ mtop_ = std::make_unique<gmx_mtop_t>();
// TODO When filename is not a .tpr, then using readConfAndAtoms
// would be efficient for not doing multiple conversions for
// makeAtomsData. However we'd also need to be able to copy the
// Do lazy initialization
if (expandedTopology_ == nullptr && hasTopology())
{
- expandedTopology_.reset(gmx_mtop_generate_local_top(mtop_.get(), false));
+ expandedTopology_ = std::make_unique<gmx_localtop_t>();
+ gmx_mtop_generate_local_top(*mtop_, expandedTopology_.get(), false);
}
return expandedTopology_.get();