# Record the C++ compiler identity and flags for the build-information output,
# but only when a C++ compiler has actually been enabled for this build.
if (CMAKE_CXX_COMPILER_LOADED)
get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
endif ()
+# Record the CUDA nvcc compiler identity and flags for the build-information
+# output when GPU acceleration is enabled.
+if(GMX_GPU)
+    get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
+endif()
+
########################################################################
# Specify install locations
% latexmlpost --destination installguide.xhtml --format=xhtml installguide.xml
%
% Crude hack to remove ugly symbols:
-% sed -e 's/§//g' -i installguide.xhtml
+% sed -e 's/[§]//g' -i installguide.xhtml
%
% Strip off header for pasting into the website at
% http://www.gromacs.org/Documentation/Installation_Instructions:
\end{enumerate}
Or, as a sequence of commands to execute:
\begin{verbatim}
-tar xfz gromacs-4.6.1.tar.gz
-cd gromacs-4.6.1
+tar xfz gromacs-4.6.3.tar.gz
+cd gromacs-4.6.3
mkdir build
cd build
cmake .. -DGMX_BUILD_OWN_FFTW=ON
\subsubsection{\mkl{}}
-Using \mkl{} with \icc{} 11 or higher is very simple. Set up your
+Using \mkl{} with icc 11 or higher is very simple. Set up your
compiler environment correctly, perhaps with a command like
\verb+source /path/to/compilervars.sh intel64+ (or consult your local
documentation). Then set \verb+-DGMX_FFT_LIBRARY=mkl+ when you run
example, download the source tarball and use
% TODO: keep up to date with new releases!
\begin{verbatim}
-$ tar xfz gromacs-4.6.1.tgz
-$ cd gromacs-4.6.1
+$ tar xfz gromacs-4.6.3.tgz
+$ cd gromacs-4.6.3
$ mkdir build-cmake
$ cd build-cmake
$ cmake ..
\end{verbatim}
Thus the names of all binaries and libraries will be appended with
-"_mod."
+"\_mod."
\subsection{Building \gromacs{}}
+++ /dev/null
-<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3c.org/TR/MathML2/dtd/xhtml-math11-f.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg">
- <head>
- <title>GROMACS installation guide</title>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- <link rel="stylesheet" type="text/css" href="core.css"/>
- </head>
- <body>
- <div class="navbar">
- <ul class="toc">
- <li class="tocentry"><span class="ref toc here">GROMACS installation guide</span>
- <ul class="toc">
- <li class="tocentry"><a href="#S1" title="1. Building GROMACS in GROMACS installation guide" class="ref toc">1 Building GROMACS</a></li>
- <li class="tocentry"><a href="#S2" title="2. Prerequisites in GROMACS installation guide" class="ref toc">2 Prerequisites</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S2.SS1" title="2.1. Platform in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.1 Platform</a></li>
- <li class="tocentry"><a href="#S2.SS2" title="2.2. Compiler in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.2 Compiler</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S2.SS2.SSS1" title="2.2.1. Running in parallel in 2.2. Compiler in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.2.1 Running in parallel</a></li>
- </ul></li>
- <li class="tocentry"><a href="#S2.SS3" title="2.3. CMake in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.3 CMake</a></li>
- <li class="tocentry"><a href="#S2.SS4" title="2.4. Fast Fourier Transform library in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.4 Fast Fourier Transform library</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S2.SS4.SSS1" title="2.4.1. FFTW in 2.4. Fast Fourier Transform library in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.4.1 FFTW</a></li>
- <li class="tocentry"><a href="#S2.SS4.SSS2" title="2.4.2. MKL in 2.4. Fast Fourier Transform library in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.4.2 MKL</a></li>
- </ul></li>
- <li class="tocentry"><a href="#S2.SS5" title="2.5. Optional build components in 2. Prerequisites in GROMACS installation guide" class="ref toc">2.5 Optional build components</a></li>
- </ul></li>
- <li class="tocentry"><a href="#S3" title="3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3 Doing a build of GROMACS</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S3.SS1" title="3.1. Configuring with CMake in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.1 Configuring with CMake</a></li>
- <li class="tocentry"><a href="#S3.SS2" title="3.2. Using CMake command-line options in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.2 Using CMake command-line options</a></li>
- <li class="tocentry"><a href="#S3.SS3" title="3.3. CMake advanced options in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.3 CMake advanced options</a></li>
- <li class="tocentry"><a href="#S3.SS4" title="3.4. Helping CMake find the right libraries/headers/programs in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.4 Helping CMake find the right libraries/headers/programs</a></li>
- <li class="tocentry"><a href="#S3.SS5" title="3.5. CMake advice during the GROMACS 4.6 beta phase in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.5 CMake advice during the GROMACS 4.6 beta phase</a></li>
- <li class="tocentry"><a href="#S3.SS6" title="3.6. Native GPU acceleration in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.6 Native GPU acceleration</a></li>
- <li class="tocentry"><a href="#S3.SS7" title="3.7. Static linking in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.7 Static linking</a></li>
- <li class="tocentry"><a href="#S3.SS8" title="3.8. Suffixes for binaries and libraries in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.8 Suffixes for binaries and libraries</a></li>
- <li class="tocentry"><a href="#S3.SS9" title="3.9. Building GROMACS in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.9 Building GROMACS</a></li>
- <li class="tocentry"><a href="#S3.SS10" title="3.10. Installing GROMACS in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.10 Installing GROMACS</a></li>
- <li class="tocentry"><a href="#S3.SS11" title="3.11. Getting access to GROMACS after installation in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.11 Getting access to GROMACS after installation</a></li>
- <li class="tocentry"><a href="#S3.SS12" title="3.12. Testing GROMACS for correctness in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.12 Testing GROMACS for correctness</a></li>
- <li class="tocentry"><a href="#S3.SS13" title="3.13. Testing GROMACS for performance in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.13 Testing GROMACS for performance</a></li>
- <li class="tocentry"><a href="#S3.SS14" title="3.14. Having difficulty? in 3. Doing a build of GROMACS in GROMACS installation guide" class="ref toc">3.14 Having difficulty?</a></li>
- </ul></li>
- <li class="tocentry"><a href="#S4" title="4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4 Special instructions for some platforms</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S4.SS1" title="4.1. Building on Windows in 4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4.1 Building on Windows</a></li>
- <li class="tocentry"><a href="#S4.SS2" title="4.2. Building on Cray in 4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4.2 Building on Cray</a></li>
- <li class="tocentry"><a href="#S4.SS3" title="4.3. Building on BlueGene in 4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4.3 Building on BlueGene</a>
- <ul class="toc">
- <li class="tocentry"><a href="#S4.SS3.SSS1" title="4.3.1. BlueGene/P in 4.3. Building on BlueGene in 4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4.3.1 BlueGene/P</a></li>
- <li class="tocentry"><a href="#S4.SS3.SSS2" title="4.3.2. BlueGene/Q in 4.3. Building on BlueGene in 4. Special instructions for some platforms in GROMACS installation guide" class="ref toc">4.3.2 BlueGene/Q</a></li>
- </ul></li>
- </ul></li>
- <li class="tocentry"><a href="#S5" title="5. Tested platforms in GROMACS installation guide" class="ref toc">5 Tested platforms</a></li>
- <li class="tocentry"><a href="#S6" title="6. Other issues in GROMACS installation guide" class="ref toc">6 Other issues</a></li>
- </ul></li>
- </ul>
- </div>
- <div class="main">
- <div class="content">
- <div class="document">
- <h1 class="title document-title">GROMACS installation guide</h1>
- <div class="section" id="S1">
- <h2 class="title section-title"> 1. Building GROMACS</h2>
- <div class="para" id="S1.p1">
- <p class="p">These instructions pertain to building GROMACS 4.6 beta releases
-and newer. For installations instructions for old GROMACS versions,
-see here
-<a href="http://www.gromacs.org/Documentation/Installation_Instructions_4.5" title="" class="ref url"><span style="" class="text typewriter">http://www.gromacs.org/Documentation/Installation_Instructions_4.5</span></a>.</p>
- </div>
-
- </div>
-
- <div class="section" id="S2">
- <h2 class="title section-title"> 2. Prerequisites</h2>
- <div class="subsection" id="S2.SS1">
- <h3 class="title subsection-title"> 2.1. Platform</h3>
- <div class="para" id="S2.SS1.p1">
- <p class="p">GROMACS can be compiled for any distribution of Linux, Mac OS X,
-Windows (native, Cygwin or MinGW), BlueGene, Cray and probably others.
-Technically, it can be compiled on any platform with an ANSI C
-compiler and supporting libraries, such as the GNU C library. It can
-even compile on an iPhone! Later, there will be a detailed list of
-hardware, platform and compilers upon which we do build and regression
-testing.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S2.SS2">
- <h3 class="title subsection-title"> 2.2. Compiler</h3>
- <div class="para" id="S2.SS2.p1">
- <p class="p">GROMACS requires an ANSI C compiler that complies with the C89
-standard. For best performance, the GROMACS team strongly
-recommends you get the most recent version of your preferred compiler
-for your platform (e.g. GCC 4.7 or Intel 12.0 or newer on x86
-hardware). There is a large amount of GROMACS code introduced in
-version 4.6 that depends on effective compiler optimization to get
-high performance - the old assembly-language routines have all
-gone. For other platforms, use the vendor's compiler, and check for
-specialized information below.</p>
- </div>
-
- <div class="subsubsection" id="S2.SS2.SSS1">
- <h4 class="title subsubsection-title"> 2.2.1. Running in parallel</h4>
- <div class="para" id="S2.SS2.SSS1.p1">
- <p class="p">GROMACS can run in parallel on multiple cores of a single
-workstation using its built-in ThreadMPI. No user action is required
-in order to enable this.</p>
- </div>
-
- <div class="para" id="S2.SS2.SSS1.p2">
- <p class="p">If you wish to use the excellent new native GPU support in GROMACS,
-NVIDIA's CUDA
-<a href="http://www.nvidia.com/object/cuda_home_new.html" title="" class="ref url"><span style="" class="text typewriter">http://www.nvidia.com/object/cuda_home_new.html</span></a> version
-3.2 software development kit is required, and the latest
-version is encouraged. NVIDIA GPUs with at least NVIDIA compute
-capability 2.0 are required, e.g. Fermi or Kepler cards.</p>
- </div>
-
- <div class="para" id="S2.SS2.SSS1.p3">
- <p class="p">The GPU support from GROMACS version 4.5 using OpenMM
-<a href="https://simtk.org/home/openmm" title="" class="ref url"><span style="" class="text typewriter">https://simtk.org/home/openmm</span></a> is still available, also requires
-CUDA, and remains the only hardware-based acceleration available
-for implicit solvent simulations in GROMACS. This parallelization
-path may not be maintained in the future.</p>
- </div>
-
- <div class="para" id="S2.SS2.SSS1.p4">
- <p class="p">If you wish to run in parallel on multiple machines across a network,
-you will need to have</p>
-
- <ul class="itemize" id="I1">
-
- <li class="item" id="I1.i1">
-
- <div class="para" id="I1.i1.p1">
- <p class="p">an MPI library installed that supports the MPI 1.3
-standard, and</p>
- </div>
-
- </li>
-
- <li class="item" id="I1.i2">
-
- <div class="para" id="I1.i2.p1">
- <p class="p">wrapper compilers that will compile code using that library.</p>
- </div>
-
- </li>
-
- </ul>
-
- <p class="p">The GROMACS team recommends OpenMPI
-<a href="http://www.open-mpi.org/" title="" class="ref url"><span style="" class="text typewriter">http://www.open-mpi.org/</span></a> version 1.4.1 (or higher), MPICH
-<a href="http://www.mpich.org/" title="" class="ref url"><span style="" class="text typewriter">http://www.mpich.org/</span></a> version 1.4.1 (or higher), or your
-hardware vendor's MPI installation. The most recent version of
-either of these is likely to be the best. LAM/MPI
-<a href="http://www.lam-mpi.org/" title="" class="ref url"><span style="" class="text typewriter">http://www.lam-mpi.org/</span></a> may work, but since it has been
-deprecated for years, it is not supported.</p>
- </div>
-
- <div class="para" id="S2.SS2.SSS1.p5">
- <p class="p">In some cases, OpenMP parallelism is an advantage for GROMACS,
-but support for this is generally built into your compiler and you
-need to make no advance preparation for this. The performance gain you
-might achieve can vary with the compiler.</p>
- </div>
-
- <div class="para" id="S2.SS2.SSS1.p6">
- <p class="p">It is important to examine how you will use GROMACS and upon what
-hardware and with what compilers in deciding which parallelization
-paths to make available. Testing the performance of different options
-is unfortunately mandatory. The days of being able to just build and
-run '<code class="verbatim">mdrun</code>' and get decent performance by default on your
-hardware are long gone. GROMACS will always run correctly, and does
-a decent job of trying to maximize your performance, but if you want
-to approach close to the optimum, you will need to do some work for
-it!</p>
- </div>
-
- </div>
-
- </div>
-
- <div class="subsection" id="S2.SS3">
- <h3 class="title subsection-title"> 2.3. CMake</h3>
- <div class="para" id="S2.SS3.p1">
- <p class="p">From version 4.6, GROMACS has moved to use the build system
-CMake. The previous build system that used <span style="" class="text typewriter">configure</span> from
-the GNU autotools package has been removed permanently. CMake
-permits the GROMACS team to support a very wide range of hardware,
-compilers and build configurations while continuing to provide the
-portability, robustness and performance for which GROMACS is known.</p>
- </div>
-
- <div class="para" id="S2.SS3.p2">
- <p class="p">GROMACS requires CMake version 2.8.0 or higher. Lower
-versions will not work. You can check whether CMake is installed,
-and what version it is, with <span style="" class="text typewriter">cmake --version</span>. If you need to
-install CMake, then first check whether your platform's package
-management system provides a suitable version, or visit
-<a href="http://www.cmake.org/cmake/help/install.html" title="" class="ref url"><span style="" class="text typewriter">http://www.cmake.org/cmake/help/install.html</span></a> for pre-compiled
-binaries, source code and installation instructions. The GROMACS
-team recommends you install the most recent version of CMake you
-can.
-</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S2.SS4">
- <h3 class="title subsection-title"> 2.4. Fast Fourier Transform library</h3>
- <div class="para" id="S2.SS4.p1">
- <p class="p">Many simulations in GROMACS make extensive use of Fourier transforms,
-and a software library to perform these is always required. We
-recommend FFTW <a href="http://www.fftw.org/" title="" class="ref url"><span style="" class="text typewriter">http://www.fftw.org/</span></a> (version 3 or higher
-only) or Intel's MKL
-<a href="http://software.intel.com/en-us/intel-mkl" title="" class="ref url"><span style="" class="text typewriter">http://software.intel.com/en-us/intel-mkl</span></a>.</p>
- </div>
-
- <div class="subsubsection" id="S2.SS4.SSS1">
- <h4 class="title subsubsection-title"> 2.4.1. FFTW</h4>
- <div class="para" id="S2.SS4.SSS1.p1">
- <p class="p">FFTW is likely to be available for your platform via its package
-management system, but there can be compatibility and significant
-performance issues associated with these packages. In particular,
-GROMACS simulations are normally run in single floating-point
-precision whereas the default FFTW package is normally in double
-precision, and good compiler options to use for FFTW when linked to
-GROMACS may not have been used. Accordingly, the GROMACS team
-recommends either</p>
-
- <ul class="itemize" id="I2">
-
- <li class="item" id="I2.i1">
-
- <div class="para" id="I2.i1.p1">
- <p class="p">that you permit the GROMACS installation to download and
-build FFTW 3.3.2 from source automatically
-for you, or</p>
- </div>
-
- </li>
-
- <li class="item" id="I2.i2">
-
- <div class="para" id="I2.i2.p1">
- <p class="p">that you build FFTW from the source code.</p>
- </div>
-
- </li>
-
- </ul>
- </div>
-
- <div class="para" id="S2.SS4.SSS1.p2">
- <p class="p">If you build FFTW from source yourself, get the most recent version
-and follow its installation guide
-(e.g. <a href="http://www.fftw.org/fftw3_doc/Installation-and-Customization.html" title="" class="ref url"><span style="" class="text typewriter">http://www.fftw.org/fftw3_doc/Installation-and-Customization.html</span></a>). Choose
-the precision (i.e. single or float vs double) to match what you will
-later require for GROMACS. There is no need to compile with
-threading or MPI support, but it does no harm. On x86 hardware,
-compile <em class="emph">only</em> with <span style="" class="text typewriter">--enable-sse2</span> (regardless of
-precision) even if your processors can take advantage of AVX
-extensions to SSE. The way GROMACS uses Fourier transforms
-cannot take advantage of this feature in FFTW because of memory
-system performance limitations, it can degrade performance by around
-20%, and there is no way for GROMACS to require the use of the
-SSE2 at run time if AVX support has been compiled into FFTW.</p>
- </div>
-
- </div>
-
- <div class="subsubsection" id="S2.SS4.SSS2">
- <h4 class="title subsubsection-title"> 2.4.2. MKL</h4>
- <div class="para" id="S2.SS4.SSS2.p1">
- <p class="p">Using MKL requires a set of linker flags that GROMACS is not
-able to detect for you, so setting up optimal linking is tricky at the
-moment. Need better documentation later.</p>
- </div>
-
- </div>
-
- </div>
-
- <div class="subsection" id="S2.SS5">
- <h3 class="title subsection-title"> 2.5. Optional build components</h3>
- <div class="para" id="S2.SS5.p1">
- <ul class="itemize" id="I3">
-
- <li class="item" id="I3.i1">
-
- <div class="para" id="I3.i1.p1">
- <p class="p">A hardware-optimized BLAS or LAPACK library is useful for
-some of the GROMACS utilities, but is not needed for running
-simulations.</p>
- </div>
-
- </li>
-
- <li class="item" id="I3.i2">
-
- <div class="para" id="I3.i2.p1">
- <p class="p">The built-in GROMACS trajectory viewer <span style="" class="text typewriter">ngmx</span> requires
-X11 and Motif/Lesstif libraries and header files. Generally, the
-GROMACS team recommends you use third-party software for
-visualization, such as VMD
-<a href="http://www.ks.uiuc.edu/Research/vmd/" title="" class="ref url"><span style="" class="text typewriter">http://www.ks.uiuc.edu/Research/vmd/</span></a> or PyMOL
-<a href="http://www.pymol.org/" title="" class="ref url"><span style="" class="text typewriter">http://www.pymol.org/</span></a>.</p>
- </div>
-
- </li>
-
- </ul>
- </div>
-
- </div>
-
- </div>
-
- <div class="section" id="S3">
- <h2 class="title section-title"> 3. Doing a build of GROMACS</h2>
- <div class="para" id="S3.p1">
- <p class="p">This section will cover a general build of GROMACS with CMake,
-but it is not an exhaustive discussion of how to use CMake. There
-are many resources available on the web, which we suggest you search
-for when you encounter problems not covered here. The material below
-applies specifically to builds on Unix-like systems, including Linux,
-Mac OS X, MinGW and Cygwin. For other platforms, see the specialist
-instructions below.</p>
- </div>
-
- <div class="subsection" id="S3.SS1">
- <h3 class="title subsection-title"> 3.1. Configuring with CMake</h3>
- <div class="para" id="S3.SS1.p1">
- <p class="p">CMake will run many tests on your system and do its best to work
-out how to build GROMACS for you. If you are building GROMACS on
-hardware that is identical to that where you will run <span style="" class="text typewriter">mdrun</span>,
-then you can be sure that the defaults will be pretty good. However, if
-you want to control aspects of the build, there's plenty of things you
-can set, too.</p>
- </div>
-
- <div class="para" id="S3.SS1.p2">
- <p class="p">The best way to use CMake to configure GROMACS is to do an
-“out-of-source” build, by making another directory from which you
-will run CMake. This can be a subdirectory or not, it doesn't
-matter. It also means you can never corrupt your source code by trying
-to build it! So, the only required argument on the CMake command
-line is the name of the directory containing the
-<span style="" class="text typewriter">CMakeLists.txt</span> file of the code you want to build. For
-example, download the source tarball and use
-</p>
- <pre class="verbatim">
-$ tar xfz gromacs-4.6-beta1-src.tgz
-$ cd gromacs-4.6-beta1
-$ mkdir build-cmake
-$ cd build-cmake
-$ cmake ..
-</pre></div>
-
- <div class="para" id="S3.SS1.p3">
- <p class="p">You will see <span style="" class="text typewriter">cmake</span> report the results of a large number of
-tests on your system made by CMake and by GROMACS. These are
-written to the CMake cache, kept in <span style="" class="text typewriter">CMakeCache.txt</span>. You
-can edit this file by hand, but this is not recommended because it is
-easy to reach an inconsistent state. You should not attempt to move or
-copy this file to do another build, because the paths are hard-coded
-within it. If you mess things up, just delete this file and start
-again with '<code class="verbatim">cmake</code>'.</p>
- </div>
-
- <div class="para" id="S3.SS1.p4">
- <p class="p">If there's a serious problem detected at this stage, then you will see
-a fatal error and some suggestions for how to overcome it. If you're
-not sure how to deal with that, please start by searching on the web
-(most computer problems already have known solutions!) and then
-consult the <span style="" class="text typewriter">gmx-users</span> mailing list. There are also
-informational warnings that you might like to take on board or
-not. Piping the output of <span style="" class="text typewriter">cmake</span> through <span style="" class="text typewriter">less</span> or
-<span style="" class="text typewriter">tee</span> can be useful, too.</p>
- </div>
-
- <div class="para" id="S3.SS1.p5">
- <p class="p">CMake works in an iterative fashion, re-running each time a setting
-is changed to try to make sure other things are consistent. Once
-things seem consistent, the iterations stop. Once <span style="" class="text typewriter">cmake</span>
-returns, you can see all the settings that were chosen and information
-about them by using</p>
- <pre class="verbatim">
-$ ccmake ..
-</pre>
- <p class="p">Check out <a href="http://www.cmake.org/cmake/help/runningcmake.html" title="" class="ref url"><span style="" class="text typewriter">http://www.cmake.org/cmake/help/runningcmake.html</span></a> for
-general advice on what you are seeing and how to navigate and change
-things. The settings you might normally want to change are already
-presented. If you make any changes, then <span style="" class="text typewriter">ccmake</span> will notice
-that and require that you re-configure (using '<code class="verbatim">c</code>'), so that it
-gets a chance to make changes that depend on yours and perform more
-checking. This might require several configuration stages when you are
-using <span style="" class="text typewriter">ccmake</span> - when you are using <span style="" class="text typewriter">cmake</span> the
-iteration is done behind the scenes.</p>
- </div>
-
- <div class="para" id="S3.SS1.p6">
- <p class="p">A key thing to consider here is the setting of
-<span style="" class="text typewriter">GMX_INSTALL_PREFIX</span>. You will need to be able to write to this
-directory in order to install GROMACS later, and if you change your
-mind later, changing it in the cache triggers a full re-build,
-unfortunately. So if you do not have super-user privileges on your
-machine, then you will need to choose a sensible location within your
-home directory for your GROMACS installation.</p>
- </div>
-
- <div class="para" id="S3.SS1.p7">
- <p class="p">When <span style="" class="text typewriter">cmake</span> or <span style="" class="text typewriter">ccmake</span> have completed iterating, the
-cache is stable and a build tree can be generated, with '<code class="verbatim">g</code>' in
-<span style="" class="text typewriter">ccmake</span> or automatically with <span style="" class="text typewriter">cmake</span>.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS2">
- <h3 class="title subsection-title"> 3.2. Using CMake command-line options</h3>
- <div class="para" id="S3.SS2.p1">
- <p class="p">Once you become comfortable with setting and changing options, you
-may know in advance how you will configure GROMACS. If so, you can
-speed things up by invoking <span style="" class="text typewriter">cmake</span> with a command like:</p>
- <pre class="verbatim">
-$ cmake .. -DGMX_GPU=ON -DGMX_MPI=ON -DGMX_INSTALL_PREFIX=/home/marydoe/programs
-</pre>
- <p class="p">to build with GPUs, MPI and install in a custom location. You can even
-save that in a shell script to make it even easier next time. You can
-also do this kind of thing with <span style="" class="text typewriter">ccmake</span>, but you should avoid
-this, because the options set with '<code class="verbatim">-D</code>' will not be able to be
-changed interactively in that run of <span style="" class="text typewriter">ccmake</span>.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS3">
- <h3 class="title subsection-title"> 3.3. CMake advanced options</h3>
- <div class="para" id="S3.SS3.p1">
- <p class="p">The options that can be seen with <span style="" class="text typewriter">ccmake</span> are ones that we
-think a reasonable number of users might want to consider
-changing. There are a lot more options available, which you can see by
-toggling the advanced mode in <span style="" class="text typewriter">ccmake</span> on and off with
-'<code class="verbatim">t</code>'. Even there, most of the variables that you might want to
-change have a '<code class="verbatim">CMAKE_</code>' or '<code class="verbatim">GMX_</code>' prefix.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS4">
- <h3 class="title subsection-title"> 3.4. Helping CMake find the right libraries/headers/programs</h3>
- <div class="para" id="S3.SS4.p1">
- <p class="p">If libraries are installed in non-default locations their location can
-be specified using the following environment variables:</p>
-
- <ul class="itemize" id="I4">
-
- <li class="item" id="I4.i1">
-
- <div class="para" id="I4.i1.p1">
- <p class="p"><span style="" class="text typewriter">CMAKE_INCLUDE_PATH</span> for header files</p>
- </div>
-
- </li>
-
- <li class="item" id="I4.i2">
-
- <div class="para" id="I4.i2.p1">
- <p class="p"><span style="" class="text typewriter">CMAKE_LIBRARY_PATH</span> for libraries</p>
- </div>
-
- </li>
-
- <li class="item" id="I4.i3">
-
- <div class="para" id="I4.i3.p1">
- <p class="p"><span style="" class="text typewriter">CMAKE_PREFIX_PATH</span> for header, libraries and binaries
-(e.g. '<code class="verbatim">/usr/local</code>').</p>
- </div>
-
- </li>
-
- </ul>
-
- <p class="p">The respective '<code class="verbatim">include</code>', '<code class="verbatim">lib</code>', or '<code class="verbatim">bin</code>' is
-appended to the path. For each of these variables, a list of paths can
-be specified (on Unix separated with ”:”). Note that these are
-environment variables (and not CMake command-line arguments) and in
-a '<code class="verbatim">bash</code>' shell are used like:</p>
- <pre class="verbatim">
-$ CMAKE_PREFIX_PATH=/opt/fftw:/opt/cuda cmake ..
-</pre></div>
-
- <div class="para" id="S3.SS4.p2">
- <p class="p">The <span style="" class="text typewriter">CC</span> and <span style="" class="text typewriter">CXX</span> environment variables are also useful
-for indicating to CMake which compilers to use, which can be very
-important for maximising GROMACS performance. Similarly,
-<span style="" class="text typewriter">CFLAGS</span>/<span style="" class="text typewriter">CXXFLAGS</span> can be used to pass compiler
-options, but note that these will be appended to those set by
-GROMACS for your build platform and build type. You can customize
-some of this with advanced options such as <span style="" class="text typewriter">CMAKE_C_FLAGS</span>
-and its relatives.</p>
- </div>
-
- <div class="para" id="S3.SS4.p3">
- <p class="p">See also: <a href="http://cmake.org/Wiki/CMake_Useful_Variables#Environment_Variables" title="" class="ref url"><span style="" class="text typewriter">http://cmake.org/Wiki/CMake_Useful_Variables#Environment_Variables</span></a></p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS5">
- <h3 class="title subsection-title"> 3.5. CMake advice during the GROMACS 4.6 beta phase</h3>
- <div class="para" id="S3.SS5.p1">
- <p class="p">We'd like users to have the ability to change any setting and still
-have the CMake cache stable; ie. not have things you set
-mysteriously change, or (worse) the whole thing breaks. We're not
-there yet. If you know in advance you will want to use a particular
-setting, set that on the initial <span style="" class="text typewriter">cmake</span> command line. If you
-have to change compilers, do that there, or immediately afterwards in
-<span style="" class="text typewriter">ccmake</span>. Gross changes like GPU or shared libraries on/off are
-more likely to work if you do them on the initial command line,
-because that's how we've been doing it while developing and
-testing. If you do make a mess of things, there's a great thing about
-an out-of-source build - you can just do '<code class="verbatim">rm -rf *</code>' and start
-again. Easy!</p>
- </div>
-
- <div class="para" id="S3.SS5.p2">
- <p class="p">We are interested in learning how you managed to break things. If you
-can reproducibly reach a state where CMake can't proceed, or
-subsequent compilation/linking/running fails, then we need to know so
-we can fix it!</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS6">
- <h3 class="title subsection-title"> 3.6. Native GPU acceleration</h3>
- <div class="para" id="S3.SS6.p1">
- <p class="p">If you have the CUDA SDK installed, you can use CMake
-with:</p>
- <pre class="verbatim">
-cmake .. -DGMX_GPU=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
-</pre>
- <p class="p">(or whichever path has your installation). Note that this will require
-a working C++ compiler, and in some cases you might need to handle
-this manually, e.g. with the advanced option
-<span style="" class="text typewriter">CUDA_HOST_COMPILER</span>.</p>
- </div>
-
- <div class="para" id="S3.SS6.p2">
- <p class="p">More documentation needed here - particular discussion of fiddly
-details on Windows, Linux and Mac required. Not all compilers on all
-systems can be made to work.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS7">
- <h3 class="title subsection-title"> 3.7. Static linking</h3>
- <div class="para" id="S3.SS7.p1">
- <p class="p">Dynamic linking of the GROMACS executables will lead to a smaller
-disk footprint when installed, and so is the default. However, on some
-hardware or under some circumstances you might need to do static
-linking. To link GROMACS binaries statically against the internal
-GROMACS libraries, set <span style="" class="text typewriter">BUILD_SHARED_LIBS=OFF</span>. To link
-statically against external libraries as well, the
-<span style="" class="text typewriter">GMX_PREFER_STATIC_LIBS=ON</span> option can be used. Note, that
-in general CMake picks up whatever is available, so this option
-only instructs CMake to prefer static libraries when both static
-and shared are available. If no static version of an external library
-is available, even when the aforementioned option is ON, the shared
-library will be used. Also note, that the resulting binaries will
-still be dynamically linked against system libraries if that is all
-that is available.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS8">
- <h3 class="title subsection-title"> 3.8. Suffixes for binaries and libraries</h3>
- <div class="para" id="S3.SS8.p1">
- <p class="p">It is sometimes convenient to have different versions of the same
-GROMACS libraries installed. The most common use cases have been
-single and double precision, and with and without MPI. By default,
-GROMACS will suffix binaries and libraries for such builds with
-'<code class="verbatim">_d</code>' for double precision and/or '<code class="verbatim">_mpi</code>' for MPI (and
-nothing otherwise). This can be controlled manually with
-<span style="" class="text typewriter">GMX_DEFAULT_SUFFIX</span>, <span style="" class="text typewriter">GMX_BINARY_SUFFIX</span> and
-<span style="" class="text typewriter">GMX_LIBRARY_SUFFIX</span>.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS9">
- <h3 class="title subsection-title"> 3.9. Building GROMACS</h3>
- <div class="para" id="S3.SS9.p1">
- <p class="p">Once you have a stable cache, you can build GROMACS. If you're not
-sure the cache is stable, you can re-run <code class="verbatim">cmake ..</code> or
-<code class="verbatim">ccmake ..</code>' to see. Then you can run <span style="" class="text typewriter">make</span> to start the
-compilation. Before actual compilation starts, <span style="" class="text typewriter">make</span> checks
-that the cache is stable, so if it isn't you will see CMake run
-again.</p>
- </div>
-
- <div class="para" id="S3.SS9.p2">
- <p class="p">So long as any changes you've made to the configuration are sensible,
-it is expected that the <span style="" class="text typewriter">make</span> procedure will always complete
-successfully. The tests GROMACS makes on the settings you choose
-are pretty extensive, but there are probably a few cases we haven't
-thought of yet. Search the web first for solutions to problems,
-but if you need help, ask on <span style="" class="text typewriter">gmx-users</span>, being sure to provide
-as much information as possible about what you did, the system you are
-building on, and what went wrong.</p>
- </div>
-
- <div class="para" id="S3.SS9.p3">
- <p class="p">If you have a multi-core or multi-CPU machine with <span style="" class="text typewriter">N</span>
-processors, then using
-</p>
- <pre class="verbatim">
-$ make -j N
-</pre>
- <p class="p">will generally speed things up by quite a bit.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS10">
- <h3 class="title subsection-title"> 3.10. Installing GROMACS</h3>
- <div class="para" id="S3.SS10.p1">
- <p class="p">Finally, <span style="" class="text typewriter">make install</span> will install GROMACS in the
-directory given in <span style="" class="text typewriter">GMX_INSTALL_PREFIX</span>. If this is an system
-directory, then you will need permission to write there, and you
-should use super-user privileges only for <span style="" class="text typewriter">make install</span> and
-not the whole procedure.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS11">
- <h3 class="title subsection-title"> 3.11. Getting access to GROMACS after installation</h3>
- <div class="para" id="S3.SS11.p1">
- <p class="p">GROMACS installs the script <span style="" class="text typewriter">GMXRC</span> in the <span style="" class="text typewriter">bin</span>
-subdirectory of the installation directory
-(e.g. <span style="" class="text typewriter">/usr/local/gromacs/bin/GMXRC</span>), which you should source
-from your shell:</p>
- <pre class="verbatim">
-$ source your-installation-prefix-here/bin/GMXRC
-</pre></div>
-
- <div class="para" id="S3.SS11.p2">
- <p class="p">It will detect what kind of shell you are running and
-set up your environment for using GROMACS. You may wish to arrange
-for your login scripts to do this automatically; please search the web
-for instructions on how to do this for your shell.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS12">
- <h3 class="title subsection-title"> 3.12. Testing GROMACS for correctness</h3>
- <div class="para" id="S3.SS12.p1">
- <p class="p">TODO install and use regression set</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS13">
- <h3 class="title subsection-title"> 3.13. Testing GROMACS for performance</h3>
- <div class="para" id="S3.SS13.p1">
- <p class="p">TODO benchmarks</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S3.SS14">
- <h3 class="title subsection-title"> 3.14. Having difficulty?</h3>
- <div class="para" id="S3.SS14.p1">
- <p class="p">You're not alone, this can be a complex task. If you encounter a
-problem with installing GROMACS, then there are a number of
-locations where you can find assistance. It is recommended that you
-follow these steps to find the solution:</p>
- </div>
-
- <div class="para" id="S3.SS14.p2">
- <ol class="enumerate" id="I5">
-
- <li class="item" id="I5.i1">
-
- <div class="para" id="I5.i1.p1">
- <p class="p">Read the installation instructions again, taking note that you
-have followed each and every step correctly.</p>
- </div>
-
- </li>
-
- <li class="item" id="I5.i2">
-
- <div class="para" id="I5.i2.p1">
- <p class="p">Search the GROMACS website and users emailing list for
-information on the error.</p>
- </div>
-
- </li>
-
- <li class="item" id="I5.i3">
-
- <div class="para" id="I5.i3.p1">
- <p class="p">Search the internet using a search engine such as Google.</p>
- </div>
-
- </li>
-
- <li class="item" id="I5.i4">
-
- <div class="para" id="I5.i4.p1">
- <p class="p">Post to the GROMACS users emailing list <span style="" class="text typewriter">gmx-users</span>
-for assistance. Be sure to give a full description of what you have
-done and why you think it didn't work. Give details about the system
-on which you are installing. Copy and paste your command line and as
-much of the output as you think might be relevant - certainly from
-the first indication of a problem. Describe the machine and
-operating system you are running on. People who might volunteer to
-help you do not have time to ask you interactive detailed follow-up
-questions, so you will get an answer faster if you provide as much
-information as you think could possibly help.</p>
- </div>
-
- </li>
-
- </ol>
- </div>
-
- </div>
-
- </div>
-
- <div class="section" id="S4">
- <h2 class="title section-title"> 4. Special instructions for some platforms</h2>
- <div class="subsection" id="S4.SS1">
- <h3 class="title subsection-title"> 4.1. Building on Windows</h3>
- <div class="para" id="S4.SS1.p1">
- <p class="p">Building on Cygwin/MinGW/etc. works just like Unix. Please see the
-instructions above.</p>
- </div>
-
- <div class="para" id="S4.SS1.p2">
- <p class="p">Building on Windows using native compilers is rather similar to
-building on Unix, so please start by reading the above. Then, download
-and unpack the GROMACS source archive. The UNIX-standard
-<span style="" class="text typewriter">.tar.gz</span> format can be managed on Windows, but you may prefer
-to browse <a href="ftp://ftp.gromacs.org/pub/gromacs" title="" class="ref url"><span style="" class="text typewriter">ftp://ftp.gromacs.org/pub/gromacs</span></a> to obtain a
-<span style="" class="text typewriter">.zip</span> format file, which doesn't need any external tools to
-unzip on recent Windows systems. Make a folder in which to do the
-out-of-source build of GROMACS. For example, make it within the
-folder unpacked from the source archive, and call it “build-cmake”.
-</p>
- </div>
-
- <div class="para" id="S4.SS1.p3">
- <p class="p">Next, you need to open a command shell. If you do this from within
-your IDE (e.g. Microsoft Visual Studio), it will configure the
-environment for you. If you use a normal Windows command shell, then
-you will need to either set up the environment to find your compilers
-and libraries yourself, or run the <span style="" class="text typewriter">vcvarsall.bat</span> batch script
-provided by MSVC (just like sourcing a bash script under
-Unix). Presumably Intel's IDE has a similar functionality.</p>
- </div>
-
- <div class="para" id="S4.SS1.p4">
- <p class="p">Within that command shell, change to the folder you created above. Run
-<code class="verbatim">cmake ..</code>, where the folder you point CMake towards is the
-folder created by the GROMACS installer. Resolve issues as
-above. You will probably make your life easier and faster by using the
-new facility to download and install FFTW automatically. After the
-initial run of <code class="verbatim">cmake</code>, you may wish to use <code class="verbatim">cmake</code>,
-<code class="verbatim">ccmake</code> or the GUI version of CMake until your configuration
-is complete.</p>
- </div>
-
- <div class="para" id="S4.SS1.p5">
- <p class="p">To compile GROMACS, you then use <code class="verbatim">cmake --build .</code> so the
-right tools get used.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S4.SS2">
- <h3 class="title subsection-title"> 4.2. Building on Cray</h3>
- <div class="para" id="S4.SS2.p1">
- <p class="p">Probably you need to build static libraries only? Volunteer needed.</p>
- </div>
-
- </div>
-
- <div class="subsection" id="S4.SS3">
- <h3 class="title subsection-title"> 4.3. Building on BlueGene</h3>
- <div class="subsubsection" id="S4.SS3.SSS1">
- <h4 class="title subsubsection-title"> 4.3.1. BlueGene/P</h4>
- <div class="para" id="S4.SS3.SSS1.p1">
- <p class="p">Mark to write later. There is currently no native acceleration on this
-platform, but the default plain C kernels will work.</p>
- </div>
-
- </div>
-
- <div class="subsubsection" id="S4.SS3.SSS2">
- <h4 class="title subsubsection-title"> 4.3.2. BlueGene/Q</h4>
- <div class="para" id="S4.SS3.SSS2.p1">
- <p class="p">Mark to write later. There is currently no native acceleration on this
-platform, but the default plain C kernels will work.</p>
- </div>
-
- </div>
-
- </div>
-
- </div>
-
- <div class="section" id="S5">
- <h2 class="title section-title"> 5. Tested platforms</h2>
- <div class="para" id="S5.p1">
- <p class="p">While it is our best belief that GROMACS will build and run pretty
-much everywhere, it's important that we tell you where we really know
-it works because we've tested it. We do test on Linux, Windows, and
-Mac with a range of compilers and libraries for a range of our
-configuration options. Every commit in our <span style="" class="text typewriter">git</span> source code
-repository is tested on … We test irregularly on…</p>
- </div>
-
- <div class="para" id="S5.p2">
- <p class="p">Contributions to this section are welcome.</p>
- </div>
-
- <div class="para" id="S5.p3">
- <p class="p">Later we might set up the ability for users to contribute test results
-to Jenkins.
-</p>
- </div>
-
- </div>
-
- <div class="section" id="S6">
- <h2 class="title section-title"> 6. Other issues</h2>
- <div class="para" id="S6.p1">
- <p class="p">The GROMACS utility programs often write data files in formats
-suitable for the Grace plotting tool, but it is straightforward to
-use these files in other plotting programs, too.</p>
- </div>
-
- </div>
-
- </div>
-
- </div>
- </div>
- </body>
- </html>
set dir = $cwd
-set VER = 4.6.2
-set DATE = `date "+%B %d, %Y`
+set VER = 4.6.3
set MANDIR = online
set HTML = $cwd/html
set HTMLOL = $HTML/$MANDIR
</td>
</TABLE></TD>
<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH="*" NOSAVE>
-<B>VERSION $VER<br>
-$DATE</B></td>
+<B>VERSION $VER</B></td>
</tr>
</table>
cd $dir
+setenv GMX_MAXBACKUP -1
foreach program ( $PROGRAMS )
- if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) ) then
+ if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "demux.pl" ) ) then
echo -n "$program "
cd $HTMLOL
$GMXBINDIR/$program -quiet -man html >& /dev/null
endif
endif
end
+echo
#last line
set(CMAKE_SYSTEM_NAME BlueGeneQ-static)
# xl.ndebug is appropriate for production calculations. For debugging,
# use xl to add back error checks and assertions
-set(CMAKE_C_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicc)
-set(CMAKE_C_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+set(CMAKE_C_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlc_r)
+set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+set(CMAKE_CXX_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlcxx_r)
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Compiler optimization flags")
mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
set(CMAKE_SYSTEM_NAME BlueGeneQ-static CACHE STRING "Cross-compiling for BlueGene/Q" FORCE)
# xl.ndebug is appropriate for production calculations. For debugging,
# use xl to add back error checks and assertions
-set(CMAKE_CXX_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpicxx)
-set(CMAKE_CXX_FLAGS_RELEASE "-O4 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+set(CMAKE_C_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlc_r)
+set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Compiler optimization flags")
+set(CMAKE_CXX_COMPILER /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlcxx_r)
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Compiler optimization flags")
mark_as_advanced(CMAKE_XL_CreateExportList) # No idea what spams this
# Turns on thread_mpi core threading functions.
MACRO(TMPI_ENABLE_CORE INCDIR)
TMPI_TEST_ATOMICS(${INCDIR})
+
+# affinity checks
+ include(CheckFunctionExists)
+ if (THREAD_PTHREADS)
+ set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+ # check for sched_setaffinity
+ check_c_source_compiles(
+ "#define _GNU_SOURCE
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+ int main(void) { cpu_set_t set;
+ CPU_ZERO(&set);
+ CPU_SET(0, &set);
+ pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
+ return 0;
+ }"
+ PTHREAD_SETAFFINITY
+ )
+ if (PTHREAD_SETAFFINITY)
+ set(HAVE_PTHREAD_SETAFFINITY 1)
+ endif (PTHREAD_SETAFFINITY)
+ set(CMAKE_REQUIRED_LIBRARIES)
+ endif (THREAD_PTHREADS)
+
+
+# this runs on POSIX systems
+ check_include_files(unistd.h HAVE_UNISTD_H)
+ check_include_files(sched.h HAVE_SCHED_H)
+ check_include_files(sys/time.h HAVE_SYS_TIME_H)
+ check_function_exists(sysconf HAVE_SYSCONF)
+# this runs on windows
+#check_include_files(windows.h HAVE_WINDOWS_H)
ENDMACRO(TMPI_ENABLE_CORE)
# enable C++ library build.
endif (THREAD_MPI_WARNINGS)
include(CheckCSourceCompiles)
-
-# affinity checks
- include(CheckFunctionExists)
- if (THREAD_PTHREADS)
- set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
- # check for sched_setaffinity
- check_c_source_compiles(
- "#define _GNU_SOURCE
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
- int main(void) { cpu_set_t set;
- CPU_ZERO(&set);
- CPU_SET(0, &set);
- pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
- return 0;
- }"
- PTHREAD_SETAFFINITY
- )
- if (PTHREAD_SETAFFINITY)
- set(HAVE_PTHREAD_SETAFFINITY 1)
- endif (PTHREAD_SETAFFINITY)
- set(CMAKE_REQUIRED_LIBRARIES)
- endif (THREAD_PTHREADS)
-
-
-# this runs on POSIX systems
- check_include_files(unistd.h HAVE_UNISTD_H)
- check_include_files(sched.h HAVE_SCHED_H)
- check_include_files(sys/time.h HAVE_SYS_TIME_H)
- check_function_exists(sysconf HAVE_SYSCONF)
-# this runs on windows
-#check_include_files(windows.h HAVE_WINDOWS_H)
ENDMACRO(TMPI_ENABLE)
mark_as_advanced(CUDA_TOOLKIT_ROOT_DIR)
endif()
+# Try to execute ${CUDA_NVCC_EXECUTABLE} --version and set the output
+# (or an error string) in the argument variable.
+# Note that semicolon is used as separator for nvcc.
+#
+# Parameters:
+# COMPILER_INFO - [output variable] string with compiler path, ID and
+# some compiler-provided information
+# COMPILER_FLAGS - [output variable] flags for the compiler
+#
+macro(get_cuda_compiler_info COMPILER_INFO COMPILER_FLAGS)
+ if(CUDA_NVCC_EXECUTABLE)
+
+ # Get the nvcc version string. This is multi-line, but since it is only 4 lines
+ # and might change in the future it is better to store than trying to parse out
+ # the version from the current format.
+ execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} --version
+ RESULT_VARIABLE _nvcc_version_res
+ OUTPUT_VARIABLE _nvcc_version_out
+ ERROR_VARIABLE _nvcc_version_err
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if (${_nvcc_version_res} EQUAL 0)
+ # Fix multi-line mess: Replace newline with ";" so we can use it in a define
+ string(REPLACE "\n" ";" _nvcc_info_singleline ${_nvcc_version_out})
+ SET(${COMPILER_INFO} "${CUDA_NVCC_EXECUTABLE} ${_nvcc_info_singleline}")
+ string(TOUPPER ${CMAKE_BUILD_TYPE} _build_type)
+ SET(_compiler_flags "${CUDA_NVCC_FLAGS_${_build_type}}")
+ if(CUDA_PROPAGATE_HOST_FLAGS)
+ string(REGEX REPLACE "[ ]+" ";" _cxx_flags_nospace "${BUILD_CXXFLAGS}")
+ endif()
+ SET(${COMPILER_FLAGS} "${CUDA_NVCC_FLAGS}${CUDA_NVCC_FLAGS_${_build_type}}; ${_cxx_flags_nospace}")
+ else ()
+ SET(${COMPILER_INFO} "N/A")
+ SET(${COMPILER_FLAGS} "N/A")
+ endif ()
+ endif ()
+endmacro ()
+
macro(gmx_gpu_setup)
# set up nvcc options
include(gmxManageNvccConfig)
- # Version info (semicolon used as line separator) for nvcc.
- get_nvcc_version_info()
-
# Atomic operations used for polling wait for GPU
# (to avoid the cudaStreamSynchronize + ECC bug).
# ThreadMPI is now always included. Thus, we don't check for Atomics anymore here.
"${_CUDA_ARCH_STR};-use_fast_math;${_HOST_COMPILER_OPTION_STRING}${CUDA_HOST_COMPILER_OPTIONS}"
CACHE STRING "Compiler flags for nvcc." FORCE)
endif()
-
-
-# Try to execute ${CUDA_NVCC_EXECUTABLE} --version and set the output
-# (or an error string) in the argument variable.
-#
-# returned in argument: CUDA nvcc compiler version string
-#
-macro(get_nvcc_version_info)
- if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_NVCC_COMPILER_INFO)
-
- # Get the nvcc version string. This is multi-line, but since it is only 4 lines
- # and might change in the future it is better to store than trying to parse out
- # the version from the current format.
- execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} --version
- RESULT_VARIABLE _nvcc_version_res
- OUTPUT_VARIABLE _nvcc_version_out
- ERROR_VARIABLE _nvcc_version_err
- OUTPUT_STRIP_TRAILING_WHITESPACE)
- if (${_nvcc_version_res} EQUAL 0)
- # Fix multi-line mess: Replace newline with ";" so we can use it in a define
- string(REPLACE "\n" ";" _nvcc_info_singleline ${_nvcc_version_out})
- SET(CUDA_NVCC_COMPILER_INFO ${_nvcc_info_singleline}
- CACHE STRING "CUDA nvcc compiler version string" FORCE)
- else ()
- SET(CUDA_NVCC_COMPILER_INFO ""
- CACHE STRING "CUDA nvcc compiler version string not available" FORCE)
- endif ()
- endif ()
- mark_as_advanced(CUDA_NVCC_COMPILER_INFO)
-endmacro ()
<TD WIDTH=116>
<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td>
<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>mdp options</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p> </p><B>VERSION 4.6<br>
-Sat 19 Jan 2013</B></td></tr></TABLE>
+</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p> </p><B>VERSION 4.6.3</B></td></tr></TABLE>
<HR>
<!--
/** CUDA nvcc compiler version information */
#define CUDA_NVCC_COMPILER_INFO "@CUDA_NVCC_COMPILER_INFO@"
+/** CUDA nvcc compiler flags */
+#define CUDA_NVCC_COMPILER_FLAGS "@CUDA_NVCC_COMPILER_FLAGS@"
/* Use (modified) Mopac 7 for QM-MM calculations */
#cmakedefine GMX_QMMM_MOPAC
+/* Use ORCA for QM-MM calculations */
+#cmakedefine GMX_QMMM_ORCA
+
/* Use the GROMACS software 1/sqrt(x) */
#cmakedefine GMX_SOFTWARE_INVSQRT
#include "mtop_util.h"
#include "gmx_ana.h"
-static int greatest_common_divisor(int p, int q)
-{
- int tmp;
- while (q != 0)
- {
- tmp = q;
- q = p % q;
- p = tmp;
- }
- return p;
-}
-
static void insert_ion(int nsa, int *nwater,
gmx_bool bSet[], int repl[], atom_id index[],
rvec x[], t_pbc *pbc,
/* Check if the system is neutralizable
* is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? */
- int gcd = greatest_common_divisor(n_q, p_q);
+ int gcd = gmx_greatest_common_divisor(n_q, p_q);
if ((qdelta % gcd) != 0)
{
gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
}
else
{
- sprintf(bbuf, " -np %d ", nnodes);
+ /* This string will be used for MPI runs and will appear after the
+ * mpirun command. */
+ if (strcmp(procstring[0], "none") != 0)
+ {
+ sprintf(bbuf, " %s %d ", procstring[0], nnodes);
+ }
+ else
+ {
+ sprintf(bbuf, " ");
+ }
}
cmd_np = bbuf;
{
int cuda_driver,cuda_runtime;
fprintf(fp, "CUDA compiler: %s\n",CUDA_NVCC_COMPILER_INFO);
+ fprintf(fp, "CUDA compiler flags:%s\n",CUDA_NVCC_COMPILER_FLAGS);
cuda_driver = 0;
cudaDriverGetVersion(&cuda_driver);
cuda_runtime = 0;
#include "main.h"
#include "md_logging.h"
+#include "thread_mpi/threads.h"
+
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
-
#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
#include "windows.h"
#endif
static const char * invalid_gpuid_hint =
"A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+/* The globally shared hwinfo structure. */
+static gmx_hw_info_t *hwinfo_g;
+/* A reference counter for the hwinfo structure */
+static int n_hwinfo = 0;
+/* A lock to protect the hwinfo structure */
+static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
+
+
/* FW decl. */
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
{
const t_commrec *cr, int ntmpi_requested,
gmx_bool bUseGPU)
{
- int npppn, ntmpi_pp, ngpu;
- char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
- char gpu_plural[2];
- gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+ int npppn, ntmpi_pp, ngpu;
+ char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
+ char gpu_plural[2];
+ gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+ int ret;
+ static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
+
assert(hwinfo);
assert(cr);
- btMPI = bMPI = FALSE;
- bNthreadsAuto = FALSE;
+ /* Below we only do consistency checks for PP and GPUs,
+ * this is irrelevant for PME only nodes, so in that case we return
+ * here.
+ */
+ if (!(cr->duty & DUTY_PP))
+ {
+ return;
+ }
+
+ /* We run this function only once, but must make sure that all threads
+ that are alive run this function, so they get consistent data. We
+ achieve this by mutual exclusion and returning if the structure is
+ already properly checked & set */
+ ret = tMPI_Thread_mutex_lock(&cons_lock);
+ if (ret != 0)
+ {
+ gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
+ }
+
+ if (!hwinfo->bConsistencyChecked)
+ {
+ btMPI = bMPI = FALSE;
+ bNthreadsAuto = FALSE;
#if defined(GMX_THREAD_MPI)
- btMPI = TRUE;
- bNthreadsAuto = (ntmpi_requested < 1);
+ btMPI = TRUE;
+ bNthreadsAuto = (ntmpi_requested < 1);
#elif defined(GMX_LIB_MPI)
- bMPI = TRUE;
+ bMPI = TRUE;
#endif
#ifdef GMX_GPU
- bGPUBin = TRUE;
+ bGPUBin = TRUE;
#else
- bGPUBin = FALSE;
+ bGPUBin = FALSE;
#endif
- /* GPU emulation detection is done later, but we need here as well
- * -- uncool, but there's no elegant workaround */
- bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
- bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+ /* GPU emulation detection is done later, but we need here as well
+ * -- uncool, but there's no elegant workaround */
+ bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
+ bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
- if (SIMMASTER(cr))
- {
- /* check the acceleration mdrun is compiled with against hardware capabilities */
- /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
- * Might not hurt to add an extra check over MPI. */
+ /* check the acceleration mdrun is compiled with against hardware
+ capabilities */
+ /* TODO: Here we assume homogeneous hardware which is not necessarily
+ the case! Might not hurt to add an extra check over MPI. */
gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
- }
-
- /* Below we only do consistency checks for PP and GPUs,
- * this is irrelevant for PME only nodes, so in that case we return here.
- */
- if (!(cr->duty & DUTY_PP))
- {
- return;
- }
- /* Need to ensure that we have enough GPUs:
- * - need one GPU per PP node
- * - no GPU oversubscription with tMPI
- * => keep on the GPU support, otherwise turn off (or bail if forced)
- * */
- /* number of PP processes per node */
- npppn = cr->nrank_pp_intranode;
-
- pernode[0] = '\0';
- th_or_proc_plural[0] = '\0';
- if (btMPI)
- {
- sprintf(th_or_proc, "thread-MPI thread");
- if (npppn > 1)
+ /* Need to ensure that we have enough GPUs:
+ * - need one GPU per PP node
+ * - no GPU oversubscription with tMPI
+ * => keep on the GPU support, otherwise turn off (or bail if forced)
+ * */
+ /* number of PP processes per node */
+ npppn = cr->nrank_pp_intranode;
+
+ pernode[0] = '\0';
+ th_or_proc_plural[0] = '\0';
+ if (btMPI)
{
- sprintf(th_or_proc_plural, "s");
+ sprintf(th_or_proc, "thread-MPI thread");
+ if (npppn > 1)
+ {
+ sprintf(th_or_proc_plural, "s");
+ }
}
- }
- else if (bMPI)
- {
- sprintf(th_or_proc, "MPI process");
- if (npppn > 1)
+ else if (bMPI)
{
- sprintf(th_or_proc_plural, "es");
+ sprintf(th_or_proc, "MPI process");
+ if (npppn > 1)
+ {
+ sprintf(th_or_proc_plural, "es");
+ }
+ sprintf(pernode, " per node");
+ }
+ else
+ {
+ /* neither MPI nor tMPI */
+ sprintf(th_or_proc, "process");
}
- sprintf(pernode, " per node");
- }
- else
- {
- /* neither MPI nor tMPI */
- sprintf(th_or_proc, "process");
- }
-
- if (bGPUBin)
- {
- print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
- }
- if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
- {
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+ if (bGPUBin)
+ {
+ print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+ }
- /* number of tMPI threads atuo-adjusted */
- if (btMPI && bNthreadsAuto && SIMMASTER(cr))
+ if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
{
- if (npppn < ngpu)
+ ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+
+            /* number of tMPI threads auto-adjusted */
+ if (btMPI && bNthreadsAuto)
{
- if (hwinfo->gpu_info.bUserSet)
+ if (npppn < ngpu)
{
- /* The user manually provided more GPUs than threads we could
- * automatically start. */
- gmx_fatal(FARGS,
- "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
- "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
- ngpu, gpu_plural, npppn, th_or_proc_plural,
- ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
- }
- else
- {
- /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
- md_print_warn(cr, fplog,
- "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
- " %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
+ if (hwinfo->gpu_info.bUserSet)
+ {
+ /* The user manually provided more GPUs than threads we
+ could automatically start. */
+ gmx_fatal(FARGS,
+ "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
+ "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
ngpu, gpu_plural, npppn, th_or_proc_plural,
- ShortProgram(), npppn, npppn > 1 ? "s" : "",
- bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
- if (cr->rank_pp_intranode == 0)
+ ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
+ }
+ else
{
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        /* There are more GPUs than tMPI threads; we have to
+                           limit the number of GPUs used. */
+ md_print_warn(cr, fplog,
+ "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+ " %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
+ ngpu, gpu_plural, npppn,
+ th_or_proc_plural,
+ ShortProgram(), npppn,
+ npppn > 1 ? "s" : "",
+ bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
+
+ if (cr->rank_pp_intranode == 0)
+ {
+ limit_num_gpus_used(hwinfo, npppn);
+ ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+ }
}
}
}
- }
- if (ngpu != npppn)
- {
- if (hwinfo->gpu_info.bUserSet)
+ if (ngpu != npppn)
{
- gmx_fatal(FARGS,
- "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
- th_or_proc, btMPI ? "s" : "es", pernode,
- ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
- }
- else
- {
- if (ngpu > npppn)
+ if (hwinfo->gpu_info.bUserSet)
{
- md_print_warn(cr, fplog,
- "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
- " PP %s%s%s than GPU%s available.\n"
- " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
- ShortProgram(),
- th_or_proc, th_or_proc_plural, pernode, gpu_plural,
- th_or_proc, npppn, gpu_plural, pernode);
-
- if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
- {
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
- }
+ gmx_fatal(FARGS,
+ "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+ "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+ th_or_proc, btMPI ? "s" : "es", pernode,
+ ShortProgram(), npppn, th_or_proc,
+ th_or_proc_plural, pernode, ngpu, gpu_plural);
}
else
{
- /* Avoid duplicate error messages.
- * Unfortunately we can only do this at the physical node
- * level, since the hardware setup and MPI process count
- * might be differ over physical nodes.
- */
- if (cr->rank_pp_intranode == 0)
+ if (ngpu > npppn)
{
- gmx_fatal(FARGS,
- "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
- th_or_proc, btMPI ? "s" : "es", pernode,
- ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
+ md_print_warn(cr, fplog,
+ "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
+ " PP %s%s%s than GPU%s available.\n"
+ " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
+ ShortProgram(), th_or_proc,
+ th_or_proc_plural, pernode, gpu_plural,
+ th_or_proc, npppn, gpu_plural, pernode);
+
+ if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
+ {
+ limit_num_gpus_used(hwinfo, npppn);
+ ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+ }
}
-#ifdef GMX_MPI
else
{
- /* Avoid other ranks to continue after inconsistency */
- MPI_Barrier(cr->mpi_comm_mygroup);
+ /* Avoid duplicate error messages.
+ * Unfortunately we can only do this at the physical node
+ * level, since the hardware setup and MPI process count
+             * might differ over physical nodes.
+ */
+ if (cr->rank_pp_intranode == 0)
+ {
+ gmx_fatal(FARGS,
+ "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+ "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+ th_or_proc, btMPI ? "s" : "es", pernode,
+ ShortProgram(), npppn, th_or_proc,
+ th_or_proc_plural, pernode, ngpu,
+ gpu_plural);
+ }
}
-#endif
}
}
- }
- hwinfo->gpu_info.bDevShare = FALSE;
- if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
- {
- int i, j, same_count;
- gmx_bool bSomeSame, bAllDifferent;
+ {
+ int same_count;
- same_count = 0; /* number of GPUs shared among ranks */
- bSomeSame = FALSE;
- bAllDifferent = TRUE;
+ same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
- for (i = 0; i < ngpu - 1; i++)
- {
- for (j = i + 1; j < ngpu; j++)
+ if (btMPI && same_count > 0)
{
- bSomeSame |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
- bAllDifferent &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
- same_count += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
+ gmx_fatal(FARGS,
+ "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
+ "Use MPI if you are sure that you want to assign GPU to multiple threads.");
+ }
+
+ if (same_count > 0)
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
+ " multiple %s%s; this should be avoided as it can cause\n"
+ " performance loss.\n",
+ same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
}
}
+ print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
+ }
+ hwinfo->bConsistencyChecked = TRUE;
+ }
- /* store the number of shared/oversubscribed GPUs */
- hwinfo->gpu_info.bDevShare = bSomeSame;
+ ret = tMPI_Thread_mutex_unlock(&cons_lock);
+ if (ret != 0)
+ {
+ gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
+ }
- if (btMPI && !bAllDifferent)
- {
- gmx_fatal(FARGS,
- "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
- "Use MPI if you are sure that you want to assign GPU to multiple threads.");
- }
+#ifdef GMX_MPI
+ if (PAR(cr))
+ {
+        /* Prevent other ranks from continuing after
+           an inconsistency */
+ MPI_Barrier(cr->mpi_comm_mygroup);
+ }
+#endif
+
+}
+
+int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
+{
+ int same_count = 0;
+ int ngpu = gpu_info->ncuda_dev_use;
- if (bSomeSame)
+ if (gpu_info->bUserSet)
+ {
+ int i, j;
+
+ for (i = 0; i < ngpu - 1; i++)
+ {
+ for (j = i + 1; j < ngpu; j++)
{
- md_print_warn(cr, fplog,
- "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
- " multiple %s%s; this should be avoided as it can cause\n"
- " performance loss.\n",
- same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
+ same_count += (gpu_info->cuda_dev_use[i] ==
+ gpu_info->cuda_dev_use[j]);
}
}
- print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
}
+
+ return same_count;
}
+
/* Return the number of hardware threads supported by the current CPU.
* We assume that this is equal with the number of CPUs reported to be
* online by the OS at the time of the call.
return ret;
}
-void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id)
+gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
+ gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
+ const char *gpu_id)
{
int i;
const char *env;
gmx_hw_info_t *hw;
gmx_gpu_info_t gpuinfo_auto, gpuinfo_user;
gmx_bool bGPUBin;
+ int ret;
- assert(hwinfo);
-
- /* detect CPUID info; no fuss, we don't detect system-wide
- * -- sloppy, but that's it for now */
- if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
+ /* make sure no one else is doing the same thing */
+ ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+ if (ret != 0)
{
- gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+ gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
}
- /* detect number of hardware threads */
- hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+ /* only initialize the hwinfo structure if it is not already initalized */
+ if (n_hwinfo == 0)
+ {
+ snew(hwinfo_g, 1);
+ hwinfo_g->bConsistencyChecked = FALSE;
- /* detect GPUs */
- hwinfo->gpu_info.ncuda_dev_use = 0;
- hwinfo->gpu_info.cuda_dev_use = NULL;
- hwinfo->gpu_info.ncuda_dev = 0;
- hwinfo->gpu_info.cuda_dev = NULL;
+ /* detect CPUID info; no fuss, we don't detect system-wide
+ * -- sloppy, but that's it for now */
+ if (gmx_cpuid_init(&hwinfo_g->cpuid_info) != 0)
+ {
+ gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+ }
+
+ /* detect number of hardware threads */
+ hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+
+ /* detect GPUs */
+ hwinfo_g->gpu_info.ncuda_dev_use = 0;
+ hwinfo_g->gpu_info.cuda_dev_use = NULL;
+ hwinfo_g->gpu_info.ncuda_dev = 0;
+ hwinfo_g->gpu_info.cuda_dev = NULL;
#ifdef GMX_GPU
- bGPUBin = TRUE;
+ bGPUBin = TRUE;
#else
- bGPUBin = FALSE;
+ bGPUBin = FALSE;
#endif
- /* Bail if binary is not compiled with GPU acceleration, but this is either
- * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
- if (bForceUseGPU && !bGPUBin)
- {
- gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
- }
- if (gpu_id != NULL && !bGPUBin)
- {
- gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
- }
-
- /* run the detection if the binary was compiled with GPU support */
- if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
- {
- char detection_error[STRLEN];
-
- if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
+ /* Bail if binary is not compiled with GPU acceleration, but this is either
+ * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
+ if (bForceUseGPU && !bGPUBin)
{
- if (detection_error != NULL && detection_error[0] != '\0')
- {
- sprintf(sbuf, ":\n %s\n", detection_error);
- }
- else
- {
- sprintf(sbuf, ".");
- }
- md_print_warn(cr, fplog,
- "NOTE: Error occurred during GPU detection%s"
- " Can not use GPU acceleration, will fall back to CPU kernels.\n",
- sbuf);
+ gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
}
- }
-
- if (bForceUseGPU || bTryUseGPU)
- {
- env = getenv("GMX_GPU_ID");
- if (env != NULL && gpu_id != NULL)
+ if (gpu_id != NULL && !bGPUBin)
{
- gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+ gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
}
- if (env == NULL)
+
+ /* run the detection if the binary was compiled with GPU support */
+ if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
{
- env = gpu_id;
+ char detection_error[STRLEN];
+
+ if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+ {
+ if (detection_error != NULL && detection_error[0] != '\0')
+ {
+ sprintf(sbuf, ":\n %s\n", detection_error);
+ }
+ else
+ {
+ sprintf(sbuf, ".");
+ }
+ md_print_warn(cr, fplog,
+ "NOTE: Error occurred during GPU detection%s"
+ " Can not use GPU acceleration, will fall back to CPU kernels.\n",
+ sbuf);
+ }
}
- /* parse GPU IDs if the user passed any */
- if (env != NULL)
+ if (bForceUseGPU || bTryUseGPU)
{
- int *gpuid, *checkres;
- int nid, res;
+ env = getenv("GMX_GPU_ID");
+ if (env != NULL && gpu_id != NULL)
+ {
+ gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+ }
+ if (env == NULL)
+ {
+ env = gpu_id;
+ }
- snew(gpuid, max_gpu_ids_user);
- snew(checkres, max_gpu_ids_user);
+ /* parse GPU IDs if the user passed any */
+ if (env != NULL)
+ {
+ int *gpuid, *checkres;
+ int nid, res;
- parse_gpu_id_plain_string(env, &nid, gpuid);
+ snew(gpuid, max_gpu_ids_user);
+ snew(checkres, max_gpu_ids_user);
- if (nid == 0)
- {
- gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
- }
+ parse_gpu_id_plain_string(env, &nid, gpuid);
- res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
+ if (nid == 0)
+ {
+ gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
+ invalid_gpuid_hint);
+ }
- if (!res)
- {
- print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+ res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
+ gpuid, nid);
- sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
- for (i = 0; i < nid; i++)
+ if (!res)
{
- if (checkres[i] != egpuCompatible)
+ print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
+
+ sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
+ for (i = 0; i < nid; i++)
{
- sprintf(stmp, " GPU #%d: %s\n",
- gpuid[i], gpu_detect_res_str[checkres[i]]);
- strcat(sbuf, stmp);
+ if (checkres[i] != egpuCompatible)
+ {
+ sprintf(stmp, " GPU #%d: %s\n",
+ gpuid[i], gpu_detect_res_str[checkres[i]]);
+ strcat(sbuf, stmp);
+ }
}
+ gmx_fatal(FARGS, "%s", sbuf);
}
- gmx_fatal(FARGS, "%s", sbuf);
- }
- hwinfo->gpu_info.bUserSet = TRUE;
+ hwinfo_g->gpu_info.bUserSet = TRUE;
- sfree(gpuid);
- sfree(checkres);
- }
- else
- {
- pick_compatible_gpus(&hwinfo->gpu_info);
- hwinfo->gpu_info.bUserSet = FALSE;
- }
+ sfree(gpuid);
+ sfree(checkres);
+ }
+ else
+ {
+ pick_compatible_gpus(&hwinfo_g->gpu_info);
+ hwinfo_g->gpu_info.bUserSet = FALSE;
+ }
- /* decide whether we can use GPU */
- hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
- if (!hwinfo->bCanUseGPU && bForceUseGPU)
- {
- gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+ /* decide whether we can use GPU */
+ hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
+ if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
+ {
+ gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+ }
}
}
+ /* increase the reference counter */
+ n_hwinfo++;
+
+ ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+ if (ret != 0)
+ {
+ gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+ }
+
+ return hwinfo_g;
}
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
{
int ndev_use;
void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
{
- if (hwinfo)
+ int ret;
+
+ ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+ if (ret != 0)
+ {
+ gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
+ }
+
+ /* decrease the reference counter */
+ n_hwinfo--;
+
+
+    /* callers must pass back the pointer obtained from gmx_detect_hardware() */
+    if (hwinfo != hwinfo_g)
+    {
+        gmx_incons("hwinfo != hwinfo_g");
+    }
+
+ if (n_hwinfo < 0)
+ {
+ gmx_incons("n_hwinfo < 0");
+ }
+
+ if (n_hwinfo == 0)
+ {
+ gmx_cpuid_done(hwinfo_g->cpuid_info);
+ free_gpu_info(&hwinfo_g->gpu_info);
+ sfree(hwinfo_g);
+ }
+
+ ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+ if (ret != 0)
{
- gmx_cpuid_done(hwinfo->cpuid_info);
- free_gpu_info(&hwinfo->gpu_info);
- sfree(hwinfo);
+ gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
}
}
int nth_affinity_set, thread_id_node, thread_id,
nthread_local, nthread_node, nthread_hw_max, nphyscore;
int offset;
- /* these are inherently global properties that are shared among all threads
- */
- static const int *locality_order;
- static int rc;
- static gmx_bool have_locality_order = FALSE;
- static tMPI_Thread_mutex_t locality_order_mtx =
- TMPI_THREAD_MUTEX_INITIALIZER;
- static tMPI_Thread_cond_t locality_order_cond =
- TMPI_THREAD_COND_INITIALIZER;
+ const int *locality_order;
+ int rc;
if (hw_opt->thread_affinity == threadaffOFF)
{
"Can not set thread affinities on the current platform. On NUMA systems this\n"
"can cause performance degradation. If you think your platform should support\n"
"setting affinities, contact the GROMACS developers.");
-#endif /* __APPLE__ */
+#endif /* __APPLE__ */
return;
}
md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
}
- /* hw_opt is shared among tMPI threads, so for thread safety we need to do
- * the layout detection only on master as core_pinning_stride is an in-out
- * parameter and gets auto-set depending on its initial value.
- * This
- * This is not thread-safe with multi-simulations, but that's anyway not
- * supported by tMPI. */
- if (SIMMASTER(cr))
- {
- int ret;
- int i;
-
- ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- rc = get_thread_affinity_layout(fplog, cr, hwinfo,
- nthread_node,
- offset, &hw_opt->core_pinning_stride,
- &locality_order);
- have_locality_order = TRUE;
- ret = tMPI_Thread_cond_broadcast(&locality_order_cond);
- if (ret != 0)
- {
- tMPI_Thread_mutex_unlock(&locality_order_mtx);
- goto locality_order_err;
- }
- ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- }
- else
- {
- int ret;
- /* all other threads wait for the locality order data. */
- ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
-
- while (!have_locality_order)
- {
- ret = tMPI_Thread_cond_wait(&locality_order_cond,
- &locality_order_mtx);
- if (ret != 0)
- {
- tMPI_Thread_mutex_unlock(&locality_order_mtx);
- goto locality_order_err;
- }
- }
- ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- }
+ rc = get_thread_affinity_layout(fplog, cr, hwinfo,
+ nthread_node,
+ offset, &hw_opt->core_pinning_stride,
+ &locality_order);
if (rc != 0)
{
}
}
return;
-
-locality_order_err:
- /* any error in affinity setting shouldn't be fatal, but should generate
- a warning */
- md_print_warn(NULL, fplog,
- "WARNING: Obtaining affinity information failed due to a basic system error: %s.\n"
- " This can cause performance degradation! ",
- strerror(errno));
- return;
}
/* Check the process affinity mask and if it is found to be non-zero,
uid = getuid();
pw = getpwuid(uid);
gh = gethostname(buf, 255);
- user = pw->pw_name;
+ /* pw returns null on error (e.g. compute nodes lack /etc/passwd) */
+ user = pw ? pw->pw_name : unk;
#else
uid = 0;
gh = -1;
} /* fixes auto-indentation problems */
#endif
-void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id);
+/* the init and consistency functions depend on commrec that may not be
+ consistent in cuda because MPI types don't exist there. */
+#ifndef __CUDACC__
+#include "types/commrec.h"
+/* return a pointer to a global hwinfo structure. */
+gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
+ gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
+ const char *gpu_id);
void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
+/* Check the thread count + GPU assignment. This function must
+ either be run by all threads that persist (i.e. all tmpi threads),
+ or be run before they are created. */
void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
const t_commrec *cr, int ntmpi_requsted,
gmx_bool bUseGPU);
+#endif
+
+
+/* Check whether a GPU is shared among ranks, and return the number of shared
+   GPUs.
+
+   gpu_info    = the GPU info struct
+
+   returns: The number of GPUs shared among ranks, or 0 */
+int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info);
+
#ifdef __cplusplus
}
gmx_large_int_t b,
gmx_large_int_t *result);
+/* Return the greatest common divisor of p and q, computed with the
+   iterative Euclidean algorithm; gcd(p, 0) == p.
+   NOTE(review): assumes non-negative arguments — C's % operator may
+   yield negative remainders for negative operands; callers here pass
+   step intervals, which should be non-negative (confirm at call sites). */
+static int gmx_greatest_common_divisor(int p, int q)
+{
+    int tmp;
+    while (q != 0)
+    {
+        tmp = q;
+        q = p % q;
+        p = tmp;
+    }
+    return p;
+}
+
#ifdef __cplusplus
}
#endif
FUNC_QUALIFIER
void nbnxn_cuda_init(FILE *fplog,
nbnxn_cuda_ptr_t *p_cu_nb,
- gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ const gmx_gpu_info_t *gpu_info, int my_gpu_index,
/* true of both local and non-local are don on GPU */
gmx_bool bLocalAndNonlocal) FUNC_TERM
}
#endif
+/*! Returns whether analytical Ewald CUDA kernels are used. */
+FUNC_QUALIFIER
+gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
+#ifdef GMX_GPU
+;
+#else
+{
+ return FALSE;
+}
+#endif
+
#ifdef __cplusplus
}
#endif
#elif defined(__FUJITSU) && ( defined(__sparc_v9__) || defined (__sparcv9) )
/* Fujitsu FX10 SPARC compiler */
-#include "atomic/fujitsu_sparc.h"
+#include "atomic/fujitsu_sparc64.h"
#else
/* otherwise, there's a generic gcc intrinsics version: */
typedef struct tMPI_Atomic_ptr
{
- volatile char* volatile* value __attribute__ ((aligned(64))); /*!< Volatile, to avoid compiler aliasing */
+ /* volatile char* volatile is not a bug, but means a volatile pointer
+ to a volatile value. This is needed for older versions of
+ xlc. */
+ volatile char* volatile value __attribute__ ((aligned(64))); /*!< Volatile, to avoid compiler aliasing */
}
tMPI_Atomic_ptr_t;
void* newval)
{
int ret;
- volatile char* volatile* oldv = oldval;
- volatile char* volatile* newv = newval;
+ volatile char* volatile oldv = (char*)oldval;
+ volatile char* volatile newv = (char*)newval;
__fence(); /* this one needs to be here to avoid ptr. aliasing issues */
__eieio();
#define EEL_SWITCHED(e) ((e) == eelSWITCH || (e) == eelSHIFT || (e) == eelENCADSHIFT || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
-#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMESWITCH))
+#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMEUSERSWITCH))
#define EEL_IS_ZERO_AT_CUTOFF(e) (EEL_SWITCHED(e) || (e) == eelRF_ZERO)
gmx_bool bDomDec;
/* PBC stuff */
- int ePBC;
- gmx_bool bMolPBC;
- int rc_scaling;
- rvec posres_com;
- rvec posres_comB;
-
- gmx_hw_info_t *hwinfo;
- gmx_bool use_cpu_acceleration;
+ int ePBC;
+ gmx_bool bMolPBC;
+ int rc_scaling;
+ rvec posres_com;
+ rvec posres_comB;
+
+ const gmx_hw_info_t *hwinfo;
+ gmx_bool use_cpu_acceleration;
/* Interaction for calculated in kernels. In many cases this is similar to
* the electrostatics settings in the inputrecord, but the difference is that
* The gmx_hardware_detect module initializes it. */
typedef struct
{
- gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
- gmx_bool bDevShare; /* true if any of the devices is shared by
- (t)MPI ranks, with auto-detection always FALSE */
+ gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
int ncuda_dev_use; /* number of devices selected to be used */
int *cuda_dev_use; /* index of the devices selected to be used */
} gmx_gpu_info_t;
/* Hardware information structure with CPU and GPU information.
- * It is initialized by gmx_detect_hardware(). */
+ * It is initialized by gmx_detect_hardware().
+ * NOTE: this structure may only contain structures that are globally valid
+ * (i.e. must be able to be shared among all threads) */
typedef struct
{
- gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
- gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
+ gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
+ gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
- gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
- NOTE: this will only detect the CPU thread 0 of the
- current process runs on. */
- int nthreads_hw_avail; /* Number of hardware threads available; this number
- is based on the number of CPUs reported as available
- by the OS at the time of detection. */
+ gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
+ NOTE: this will only detect the CPU thread 0 of the
+ current process runs on. */
+ int nthreads_hw_avail; /* Number of hardware threads available; this number
+ is based on the number of CPUs reported as available
+ by the OS at the time of detection. */
+ gmx_bool bConsistencyChecked; /* whether
+ gmx_check_hw_runconf_consistency()
+ has been run with this hw_info */
} gmx_hw_info_t;
#ifdef __cplusplus
libdir=@LIB_INSTALL_DIR@
includedir=@INCL_INSTALL_DIR@
-Name: libgromacs
+Name: libgromacs@GMX_LIBS_SUFFIX@
Description: Gromacs library
URL: http://www.gromacs.org
Version: @PROJECT_VERSION@
* In mdrun, hwinfo has already been set before calling init_forcerec.
* Here we ignore GPUs, as tools will not use them anyhow.
*/
- snew(fr->hwinfo, 1);
- gmx_detect_hardware(fp, fr->hwinfo, cr,
- FALSE, FALSE, NULL);
+ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
}
/* By default we turn acceleration on, but it might be turned off further down... */
* group kernels are OK. See Redmine #1249. */
if (fr->bAllvsAll)
{
- fr->bAllvsAll = FALSE;
+ fr->bAllvsAll = FALSE;
fr->use_cpu_acceleration = FALSE;
if (fp != NULL)
{
*/
#ifndef _nbnxn_atomdata_h
-#define _nsnxn_atomdata_h
+#define _nbnxn_atomdata_h
#include "typedefs.h"
#include "types/interaction_const.h"
#include "types/force_flags.h"
#include "../nbnxn_consts.h"
+#include "gmx_detect_hardware.h"
#include "nbnxn_cuda_types.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
void nbnxn_cuda_init(FILE *fplog,
nbnxn_cuda_ptr_t *p_cu_nb,
- gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ const gmx_gpu_info_t *gpu_info, int my_gpu_index,
gmx_bool bLocalAndNonlocal)
{
cudaError_t stat;
* - atomics are available, and
* - GPUs are not being shared.
*/
- bool bShouldUsePollSync = (bX86 && bTMPIAtomics && !gpu_info->bDevShare);
+ bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
+ (gmx_count_gpu_dev_shared(gpu_info) < 1));
if (bStreamSync)
{
gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
}
+
+/* Return TRUE if the selected CUDA non-bonded kernel computes the Ewald
+   correction analytically, i.e. the electrostatics type is
+   eelCuEWALD_ANA or eelCuEWALD_ANA_TWIN (twin-range cut-off variant). */
+gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
+{
+    return ((cu_nb->nbparam->eeltype == eelCuEWALD_ANA) ||
+            (cu_nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
+}
*/
#ifndef _nbnxn_search_h
-#define _nsnxn_search_h
+#define _nbnxn_search_h
#include "typedefs.h"
void init_orca(t_commrec *cr, t_QMrec *qm, t_MMrec *mm)
{
- char
- *buf;
+ char *buf;
snew(buf, 200);
+
/* ORCA settings on the system */
buf = getenv("BASENAME");
if (buf)
}
else
{
- gmx_fatal(FARGS, "no $BASENAME\n");
+ gmx_fatal(FARGS, "$BASENAME not set\n");
}
/* ORCA directory on the system */
snew(buf, 200);
buf = getenv("ORCA_PATH");
- fprintf(stderr, "%s", buf);
if (buf)
{
}
else
{
- gmx_fatal(FARGS, "no $ORCA_PATH, check manual\n");
+ gmx_fatal(FARGS, "$ORCA_PATH not set, check manual\n");
}
- fprintf(stderr, "%s...\n", qm->orca_dir);
- fprintf(stderr, "orca initialised...\n");
+ fprintf(stderr, "Setting ORCA path to: %s...\n", qm->orca_dir);
+ fprintf(stderr, "ORCA initialised...\n\n");
/* since we append the output to the BASENAME.out file,
we should delete an existent old out-file here. */
sprintf(buf, "%s.out", qm->orca_basename);
void write_orca_input(int step, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm)
{
- int
- i;
- t_QMMMrec
- *QMMMrec;
- FILE
- *out, *pcFile, *addInputFile, *LJCoeff;
- char
- *buf, *orcaInput, *addInputFilename, *LJCoeffFilename,
- *pcFilename, *exclInName, *exclOutName;
+ int i;
+ t_QMMMrec *QMMMrec;
+ FILE *out, *pcFile, *addInputFile, *LJCoeff;
+ char *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
+
QMMMrec = fr->qr;
+
/* write the first part of the input-file */
snew(orcaInput, 200);
sprintf(orcaInput, "%s.inp", qm->orca_basename);
out = fopen(orcaInput, "w");
+
snew(addInputFilename, 200);
sprintf(addInputFilename, "%s.ORCAINFO", qm->orca_basename);
addInputFile = fopen(addInputFilename, "r");
- fprintf(out, "#input-file generated by gromacs\n");
+
+ fprintf(out, "#input-file generated by GROMACS\n");
+
if (qm->bTS)
{
fprintf(out, "!QMMMOpt TightSCF\n");
{
fprintf(out, "!EnGrad TightSCF\n");
}
+
/* here we include the insertion of the additional orca-input */
snew(buf, 200);
if (addInputFile != NULL)
}
else
{
- fprintf(stderr, "No information on the calculation given in <%s>\n", addInputFilename);
- gmx_call("qm_orca.c");
+ gmx_fatal(FARGS, "No information on the calculation given in %s\n", addInputFilename);
}
+
fclose(addInputFile);
+
if (qm->bTS || qm->bOPT)
{
/* freeze the frontier QM atoms and Link atoms. This is
fclose(LJCoeff);
}
}
- /* write charge and multiplicity
- */
+
+ /* write charge and multiplicity */
fprintf(out, "*xyz %2d%2d\n", qm->QMcharge, qm->multiplicity);
- /* write the QM coordinates
- */
+
+ /* write the QM coordinates */
for (i = 0; i < qm->nrQMatoms; i++)
{
int atomNr;
#endif
}
fprintf(out, "*\n");
- /* write the MM point charge data
- */
+
+ /* write the MM point charge data */
if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
{
/* name of the point charge file */
int nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
char *env;
nonbonded_verlet_group_t *nbvg;
+ gmx_bool bCUDA;
if (!(flags & GMX_FORCE_NONBONDED))
{
gmx_incons("Invalid cut-off scheme passed!");
}
- if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
+ bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+
+ if (!bCUDA)
{
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
gmx_incons("Invalid nonbonded kernel type passed!");
}
- if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
+ if (!bCUDA)
{
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
}
- else if (nbvg->ewald_excl == ewaldexclTable)
+ else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
+ (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
{
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
}
else
{
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
}
enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
if (flags & GMX_FORCE_ENERGY)
seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
}
-#pragma omp parallel num_threads(ngr)
+ if (ngr != gmx_omp_nthreads_get(emntUpdate))
+ {
+ gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)");
+ }
+
+#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
{
int th;
th = gmx_omp_get_thread_num();
if (th > 0)
{
- /* Initialize on each thread to have thread-local memory alloced */
+ /* Initialize on each thread to get memory allocated thread-local */
sd->gaussrand[th] = gmx_rng_init(seed[th]);
}
}
}
}
+/* Pre-compute the per-temperature-group sigma factors (sd->sdsig) used by
+   the SD2 stochastic-dynamics update, from sd->sdc and the group reference
+   temperatures / coupling times.
+
+   sd    = stochastic dynamics data: sdc[] is read, sdsig[] is written
+   ngtc  = number of temperature-coupling groups
+   tau_t = per-group coupling time constants
+   ref_t = per-group reference temperatures */
+static void do_update_sd2_Tconsts(gmx_stochd_t *sd,
+                                  int ngtc,
+                                  const real tau_t[],
+                                  const real ref_t[])
+{
+    /* This is separated from the update below, because it is single threaded */
+    gmx_sd_const_t *sdc;
+    gmx_sd_sigma_t *sig;
+    int gt;
+    real kT;
+
+    sdc = sd->sdc;
+    sig = sd->sdsig;
+
+    for (gt = 0; gt < ngtc; gt++)
+    {
+        kT = BOLTZ*ref_t[gt];
+        /* The mass is accounted for later, since this differs per atom */
+        sig[gt].V = sqrt(kT*(1-sdc[gt].em));
+        sig[gt].X = sqrt(kT*sqr(tau_t[gt])*sdc[gt].c);
+        sig[gt].Yv = sqrt(kT*sdc[gt].b/sdc[gt].c);
+        sig[gt].Yx = sqrt(kT*sqr(tau_t[gt])*sdc[gt].b/(1-sdc[gt].em));
+    }
+}
+
static void do_update_sd2(gmx_stochd_t *sd,
gmx_rng_t gaussrand,
gmx_bool bInitStep,
unsigned short cTC[],
rvec x[], rvec xprime[], rvec v[], rvec f[],
rvec sd_X[],
- int ngtc, real tau_t[], real ref_t[],
+ const real tau_t[],
gmx_bool bFirstHalf)
{
gmx_sd_const_t *sdc;
sig = sd->sdsig;
sd_V = sd->sd_V;
- if (bFirstHalf)
- {
- for (n = 0; n < ngtc; n++)
- {
- kT = BOLTZ*ref_t[n];
- /* The mass is encounted for later, since this differs per atom */
- sig[n].V = sqrt(kT*(1-sdc[n].em));
- sig[n].X = sqrt(kT*sqr(tau_t[n])*sdc[n].c);
- sig[n].Yv = sqrt(kT*sdc[n].b/sdc[n].c);
- sig[n].Yx = sqrt(kT*sqr(tau_t[n])*sdc[n].b/(1-sdc[n].em));
- }
- }
-
for (n = start; n < nrend; n++)
{
ism = sqrt(invmass[n]);
}
}
+/* Pre-compute the per-temperature-group random-force factors rf[] for the
+   Brownian-dynamics update.
+
+   dt                   = integration time step
+   friction_coefficient = bd_fric; when zero, the dt/friction scaling is
+                          omitted here (presumably applied later per atom
+                          via tau_t — confirm against do_update_bd)
+   ngtc                 = number of temperature-coupling groups
+   ref_t                = per-group reference temperatures
+   rf                   = output: ngtc random-force factors */
+static void do_update_bd_Tconsts(double dt, real friction_coefficient,
+                                 int ngtc, const real ref_t[],
+                                 real *rf)
+{
+    /* This is separated from the update below, because it is single threaded */
+    int gt;
+
+    if (friction_coefficient != 0)
+    {
+        for (gt = 0; gt < ngtc; gt++)
+        {
+            rf[gt] = sqrt(2.0*BOLTZ*ref_t[gt]/(friction_coefficient*dt));
+        }
+    }
+    else
+    {
+        for (gt = 0; gt < ngtc; gt++)
+        {
+            rf[gt] = sqrt(2.0*BOLTZ*ref_t[gt]);
+        }
+    }
+}
+
static void do_update_bd(int start, int nrend, double dt,
ivec nFreeze[],
real invmass[], unsigned short ptype[],
unsigned short cFREEZE[], unsigned short cTC[],
rvec x[], rvec xprime[], rvec v[],
rvec f[], real friction_coefficient,
- int ngtc, real tau_t[], real ref_t[],
real *rf, gmx_rng_t gaussrand)
{
/* note -- these appear to be full step velocities . . . */
if (friction_coefficient != 0)
{
invfr = 1.0/friction_coefficient;
- for (n = 0; n < ngtc; n++)
- {
- rf[n] = sqrt(2.0*BOLTZ*ref_t[n]/(friction_coefficient*dt));
- }
- }
- else
- {
- for (n = 0; n < ngtc; n++)
- {
- rf[n] = sqrt(2.0*BOLTZ*ref_t[n]);
- }
}
+
for (n = start; (n < nrend); n++)
{
if (cFREEZE)
md->invmass, md->ptype,
md->cFREEZE, md->cACC, md->cTC,
state->x, xprime, state->v, force, state->sd_X,
- inputrec->opts.ngtc, inputrec->opts.tau_t,
- inputrec->opts.ref_t, FALSE);
+ inputrec->opts.tau_t,
+ FALSE);
}
inc_nrnb(nrnb, eNR_UPDATE, homenr);
dump_it_all(fplog, "Before update",
state->natoms, state->x, xprime, state->v, force);
- if (EI_RANDOM(inputrec->eI))
+ if (inputrec->eI == eiSD2)
{
- /* We still need to take care of generating random seeds properly
- * when multi-threading.
- */
- nth = 1;
+ check_sd2_work_data_allocation(upd->sd, nrend);
+
+ do_update_sd2_Tconsts(upd->sd,
+ inputrec->opts.ngtc,
+ inputrec->opts.tau_t,
+ inputrec->opts.ref_t);
}
- else
+ if (inputrec->eI == eiBD)
{
- nth = gmx_omp_nthreads_get(emntUpdate);
+ do_update_bd_Tconsts(dt, inputrec->bd_fric,
+ inputrec->opts.ngtc, inputrec->opts.ref_t,
+ upd->sd->bd_rf);
}
- if (inputrec->eI == eiSD2)
- {
- check_sd2_work_data_allocation(upd->sd, nrend);
- }
+ nth = gmx_omp_nthreads_get(emntUpdate);
#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
for (th = 0; th < nth; th++)
md->invmass, md->ptype,
md->cFREEZE, md->cACC, md->cTC,
state->x, xprime, state->v, force, state->sd_X,
- inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
+ inputrec->opts.tau_t,
TRUE);
break;
case (eiBD):
md->cFREEZE, md->cTC,
state->x, xprime, state->v, force,
inputrec->bd_fric,
- inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
upd->sd->bd_rf, upd->sd->gaussrand[th]);
break;
case (eiVV):
debug_gmx();
- /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+    /* set free energy calculation frequency as the greatest common
+       divisor of nstdhdl, nstexpanded, and repl_ex_nst */
nstfep = ir->fepvals->nstdhdl;
- if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
+ if (ir->bExpanded)
{
-        nstfep = ir->expandedvals->nstexpanded;
+        /* nstfep already equals nstdhdl here (set above), so taking the gcd
+           with nstdhdl again is a no-op; the expanded-ensemble interval
+           nstexpanded is what must enter the gcd. */
+        nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
}
- if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
+ if (repl_ex_nst > 0)
{
- nstfep = repl_ex_nst;
+ nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep);
}
/* I'm assuming we need global communication the first time! MRS */
* Thus all options should be internally consistent and consistent
* with the hardware, except that ntmpi could be larger than #GPU.
*/
-static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
+static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
gmx_hw_opt_t *hw_opt,
t_inputrec *inputrec, gmx_mtop_t *mtop,
const t_commrec *cr,
}
}
-static void prepare_verlet_scheme(FILE *fplog,
- gmx_hw_info_t *hwinfo,
- t_commrec *cr,
- const char *nbpu_opt,
- t_inputrec *ir,
- const gmx_mtop_t *mtop,
- matrix box,
- gmx_bool *bUseGPU)
+static void prepare_verlet_scheme(FILE *fplog,
+ const gmx_hw_info_t *hwinfo,
+ t_commrec *cr,
+ const char *nbpu_opt,
+ t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_bool *bUseGPU)
{
/* Here we only check for GPU usage on the MPI master process,
* as here we don't know how many GPUs we will use yet.
bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+ /* Detect hardware, gather information. This is an operation that is
+ * global for this process (MPI rank). */
+ hwinfo = gmx_detect_hardware(fplog, cr,
+ bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+
+
snew(state, 1);
if (SIMMASTER(cr))
{
convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
}
- /* Detect hardware, gather information. With tMPI only thread 0 does it
- * and after threads are started broadcasts hwinfo around. */
- snew(hwinfo, 1);
- gmx_detect_hardware(fplog, hwinfo, cr,
- bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
minf.cutoff_scheme = inputrec->cutoff_scheme;
minf.bUseGPU = FALSE;
pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
}
-#if defined GMX_THREAD_MPI
- /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
- * to the other threads -- slightly uncool, but works fine, just need to
- * make sure that the data doesn't get freed twice. */
- if (cr->nnodes > 1)
- {
- if (!SIMMASTER(cr))
- {
- snew(hwinfo, 1);
- }
- gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
- }
-#else
- if (PAR(cr) && !SIMMASTER(cr))
- {
- /* now we have inputrec on all nodes, can run the detection */
- /* TODO: perhaps it's better to propagate within a node instead? */
- snew(hwinfo, 1);
- gmx_detect_hardware(fplog, hwinfo, cr,
- bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
- }
-
- /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
- gmx_check_thread_affinity_set(fplog, cr,
- hw_opt, hwinfo->nthreads_hw_avail, FALSE);
-#endif
-
/* now make sure the state is initialized and propagated */
set_state_entries(state, inputrec, cr->nnodes);
(cr->duty & DUTY_PP) == 0,
inputrec->cutoff_scheme == ecutsVERLET);
- gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
+ /* check consistency and decide on the number of gpus to use. */
+ gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
+ minf.bUseGPU);
/* getting number of PP/PME threads
PME: env variable should be read only on one node to make sure it is
sfree(membed);
}
-#ifdef GMX_THREAD_MPI
- if (PAR(cr) && SIMMASTER(cr))
-#endif
- {
- gmx_hardware_info_free(hwinfo);
- }
+ gmx_hardware_info_free(hwinfo);
/* Does what it says */
print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime);