# git branch can be tested. Normally, this will be the version of the
# last patch release. Comment the next line out for branches leading
# to a major/minor release.
-set(REGRESSIONTEST_VERSION "4.6.2")
+set(REGRESSIONTEST_VERSION "4.6.4")
set(CUSTOM_VERSION_STRING ""
CACHE STRING "Custom version string (if empty, use hard-coded default)")
mark_as_advanced(CUSTOM_VERSION_STRING)
\end{enumerate}
Or, as a sequence of commands to execute:
\begin{verbatim}
-tar xfz gromacs-4.6.3.tar.gz
-cd gromacs-4.6.3
+tar xfz gromacs-4.6.4.tar.gz
+cd gromacs-4.6.4
mkdir build
cd build
cmake .. -DGMX_BUILD_OWN_FFTW=ON
example, download the source tarball and use
% TODO: keep up to date with new releases!
\begin{verbatim}
-$ tar xfz gromacs-4.6.3.tgz
-$ cd gromacs-4.6.3
+$ tar xfz gromacs-4.6.4.tgz
+$ cd gromacs-4.6.4
$ mkdir build-cmake
$ cd build-cmake
$ cmake ..
\verb+-DREGRESSIONTEST_DOWNLOAD+, and run \verb+make check+.
\gromacs{} will automatically download and run the tests for you.
Alternatively, you can download and unpack the tarball yourself from
-\url{http://gerrit.gromacs.org/download/regressiontests-4.6.1.tar.gz},
+\url{http://gerrit.gromacs.org/download/regressiontests-4.6.4.tar.gz},
and use the advanced \cmake{} option \verb+REGRESSIONTEST_PATH+ to
specify the path to the unpacked tarball, which will then be used for
testing. If this doesn't work, then please read on.
This is the architecture of the K computer, which uses Fujitsu Sparc64viiifx
chips. Gromacs-4.6 will build with default C kernels on this architecture,
-and Gromacs-4.6.2 will add accelerated kernels and a custom toolchain.
+and Gromacs-4.6.2 added accelerated group kernels and a custom toolchain.
\section{Tested platforms}
set dir = $cwd
-set VER = 4.6.3
+set VER = 4.6.4
set MANDIR = online
set HTML = $cwd/html
set HTMLOL = $HTML/$MANDIR
echo "<br><br>" >> $HTMLIDX
end
cat >> $HTMLIDX <<EOD
-<A HREF="gmxfaq.html">FAQ</a>
+<A HREF="http://www.gromacs.org/Documentation/FAQs">FAQ</a>
<br>
</TD>
<TD VALIGN=top WIDTH=75%>
# Try to detect CPU information and suggest an acceleration option
# (such as SSE/AVX) that fits the current CPU. These functions assume
# that gmx_detect_target_architecture() has already been run, so that
-# things like GMX_IS_X86 are already available.
+# things like GMX_TARGET_X86 are already available.
#
# Sets ${GMX_SUGGESTED_CPU_ACCELERATION} in the parent scope if
# GMX_CPU_ACCELERATION is not set (e.g. by the user, or a previous run
message(STATUS "Detecting best acceleration for this CPU")
# Get CPU acceleration information
+ set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE")
+ if(GMX_TARGET_X86)
+ set(_compile_definitions "${_compile_definitions} -DGMX_TARGET_X86")
+ endif()
try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE -DGMX_IS_X86"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_TMP
COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT
ARGS "-acceleration")
function(gmx_detect_acceleration _suggested_acceleration)
if(NOT DEFINED GMX_CPU_ACCELERATION)
- if(GMX_IS_BGQ)
+ if(GMX_TARGET_BGQ)
set(${_suggested_acceleration} "IBM_QPX")
- elseif(GMX_IS_X86)
+ elseif(GMX_TARGET_X86)
gmx_suggest_x86_acceleration(${_suggested_acceleration})
else()
set(${_suggested_acceleration} "None")
# - architecture is one for which GROMACS has special treatment
# - (e.g. kernel acceleration)
#
-# Sets GMX_IS_X86 or GMX_IS_BGQ if targetting that architecture
+# Sets GMX_TARGET_X86 or GMX_TARGET_BGQ if targeting that
+# architecture. May set other such variables if/when there is future
+# need.
function(gmx_detect_target_architecture)
- try_compile(GMX_IS_X86 ${CMAKE_BINARY_DIR}
- "${CMAKE_SOURCE_DIR}/cmake/TestX86.c")
- try_compile(GMX_IS_BGQ ${CMAKE_BINARY_DIR}
- "${CMAKE_SOURCE_DIR}/cmake/TestBlueGeneQ.c")
+ if (NOT DEFINED GMX_TARGET_X86)
+ try_compile(GMX_TARGET_X86 ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestX86.c")
+ endif()
+ if (NOT DEFINED GMX_TARGET_BGQ)
+ try_compile(GMX_TARGET_BGQ ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestBlueGeneQ.c")
+ endif()
endfunction()
if(NOT CMAKE_CROSSCOMPILING)
# Get CPU acceleration information
- set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE -DGMX_IS_X86")
+ set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders -DGMX_CPUID_STANDALONE")
+ if(GMX_TARGET_X86)
+ set(_compile_definitions "${_compile_definitions} -DGMX_TARGET_X86")
+ endif()
try_run(GMX_CPUID_RUN_VENDOR GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_cpuid.c
+++ /dev/null
-<HTML>
-<HEAD>
-<TITLE>GROMACS FAQ</TITLE>
-<LINK rel=stylesheet href="online/style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
-<TABLE WIDTH="98%" NOBORDER >
-<TR><TD WIDTH=400>
-<TABLE WIDTH=400 NOBORDER>
-<TD WIDTH=116>
-<a href="http://www.gromacs.org/"><img SRC="images/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td>
-<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>GROMACS<br>FAQ</h2><font size=-1><A HREF="online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p><B>VERSION 4.5<br>
-Thu 26 Aug 2010</B></td></tr></TABLE>
-<HR>
-
-<p>If you don't find the solution to your problem here, you could have a look in
-the online archives of our <a href="http://www.gromacs.org/mailing_lists/index.php">
-mailing lists</a>, or subscribe yourself!
-
-<p>There is also a special <a href="http://www.gromacs.org/developer/developer_faq.php">Developer FAQ</a>
-at www.gromacs.org with more advanced and/or technical topics (e.g. automake/autoconf) available under
-Developer info that you could use, and when all else fails it's
-time to post your question to the mailing lists!</p>
-
-
-
-<h3>Download & Installation</h3>
-
-<ul>
-<li><A HREF="#getgromacs">How can I get GROMACS and how much does it cost?</A>
-<li><A HREF="#systemsupported">Is my system supported?</A>
-<li><A HREF="#binaries">Can't you provide binaries for my system?</A>
-<li><A HREF="#install">How do I compile and install the GROMACS package?</A>
-<li><A HREF="#compiler">How do I select the compiler and/or flags to use?</A>
-<li><A HREF="#fftw">The configuration script complains about FFTW - how
-do I install it?</A>
-<li><A HREF="#fftwlocation">I HAVE installed FFTW, but the configuration script
-still says it can't find it!</A>
-<li><A HREF="#MPI">How do I compile GROMACS for parallel runs?</A>
-<li><A HREF="#noMPIwrapper">When I enable MPI support for parallel runs, GROMACS
-looks for a special MPI wrapper script like 'mpicc', but we don't use that; is it possible to
- add the MPI library manually with -lmpi ?</A>
-<li><A HREF="#nomotif">How do I turn off Motif?</A>
-<li><A HREF="#ldpath">Everything compiles fine, but when I try to run a program
-it complains about not finding libXXXX.so.</A>
-<li><A HREF="#osx_zsh">I get an error from the configure script on Mac OS X!</A>
-<li><A HREF="#noclue">It still won't compile and I haven't got a clue what the problem might be...</A>
-<li><A HREF="#relativespeed">How fast is GROMACS compared to other programs?</A>
-<li><A HREF="#speed">Is there any way I can make GROMACS run faster?</A>
-<li><A HREF="#besthardware">What hardware do you recommend?</A>
-<li><A HREF="#systemsize">How large systems can I simulate with GROMACS?</A>
-<li><A HREF="#pdfgraphics">Why is the front page graphics in the PDF manual strange?</A>
-</ul><br>
-
-<h3>System preparation</h3>
-
-<ul>
-<li><A HREF="#PDB">OK, I've downloaded a PDB file with a structure I'd like
-to simulate. What should I do?</A>
-<li><A HREF="#multi">My protein has multiple subunits. Is that a problem?</A>
-<li><A HREF="#convert">How do I convert my structure from a .gro, .tpr, or trajectory
-file to a .pdb file?</A>
-<li><A HREF="#scmis">The <TT>pdb2gmx</TT> program is complaining about long bonds and/or
-missing atoms. What should I do?</A>
-<li><A HREF="#osxcpp">grompp doesn't find the C preprocessor /lib/cpp on OS X!</A>
-</ul><br>
-
-<h3>Simulation</h3>
-
-<ul>
-<li><A HREF="#1-4cut">What does "1-4 (#,#) interaction not within cut-off" mean?</A>
-<li><A HREF="#libnet">What does "Fatal error: Routine gmx_tx called in libnet.c" mean?</A>
-<li><A HREF="#output">My simulation seems to be running, but shouldn't there be any output?</A>
-<li><A HREF="#temp">Why do I get very strange temperatures in my simulation?</A>
-<li><A HREF="#recover">Is there any smart way to continue a run that crashed?</A>
-<li><A HREF="#largefiles">When my trajectory files reach 2GB I get strange error messages,
-or they just disappear. Why?</A>
-</ul><br>
-
-<h3>Analysis</h3>
-
-<ul>
-<li><A HREF="#multPDB">How do I analyze a PDB file with multiple entries?</A>
-<li><A HREF="#twostruc">Can I fit two structures which do not have the
-same number/sequence of atoms?</A>
-<li><A HREF="#group">I get tired of having to select the same index group
-over and over again. Is there a better way to do it?</A>
-<li><A HREF="#diys">How do I perform an analysis that GROMACS doesn't have a program
-for?</A>
-</ul><br>
-
-<h3>Other problems</h3>
-
-<ul>
-<li><A HREF="#none">My problem isn't mentioned above, and/or none of the solutions seem to work?</A>
-</ul>
-<br><br><br><br>
-
-<hr>
-
-<ul>
-<li><A NAME="getgromacs">
-<B>How can I get GROMACS and how much does it cost?</B><br><br>
-You can download it immediately from this website, and it won't cost
-you a penny! GROMACS is free software, licensed under the GNU General
-Public License. The details are available in the
-<A HREF="http://www.gnu.org/copyleft/gpl.html">license text</a>, but
-in short you can modify and redistribute the code as long as your
-version is licensed under the GPL too. <br><br>
-
-<li><A NAME="systemsupported">
-<B>Is my system supported?</B><br><br>
-GROMACS is a recursive acronym for "GROMACS Runs On Most Of All Computer Systems" :-)<br>
-Since we use GNU automatic configuration scripts you should in principle
-be able to compile GROMACS on any UNIX dialect, probably including Mac OS X.
-Contact us if you have any problems. At least Solaris, IRIX, Linux (both x86 and alpha),
-Tru64/Digital Unix, and AIX should be virtually problem-free. An ANSI C compiler
-is sufficient to compile GROMACS, but we definitely recommend a good Fortran 77
-compiler too (performance-critical routines are available in fortran versions). You
-won't need Fortran on Linux/x86 where we provide even faster assembly loops!<br><br>
-
-<li><A NAME="binaries">
-<B>Can't you provide binaries for my system?</B><br><br>
-The problem is that we want the highest possible performance, and to achieve this
-it is necessary to adapt compiler flags to your processor type. We also use special
-mathematic libraries from several hardware manufacturers, and different versions
-of MPI for parallel runs. This means we would have to provide about 10 different
-sets of binaries for each processor on each operation system, and keep them updated
-for each new release. Sorry, but that's simply not possible.<br>
-However, for Linux running on x86 computers it doesn't matter which compiler
-flags we use since it doesn't affect the assembly loops, and we can thus distribute
-RPM packages of GROMACS. If you want to run in parallel you probably have to install our
-versions of the LAM MPI packages to get the correct version, or compile GROMACS yourself.
-<br><br>
-
-<li><A NAME="install">
-<B>How do I compile and install the GROMACS package?</B><br><br>
-That's easy - we provide step-by-step instructions for you <A HREF="/installation/">here</A>.<br>
-If you're impatient you could just unpack the distribution and try<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>
- ./configure<br>
- make<br>
- make install</tt>
-<td></td>
-</tr>
-</table>
-<br>
-
-The configure script will complain if it doesn't find FFTW, but you will be told what to do.<br><br>
-This setup is new from version 3.0, so there might be some bugs we've missed, though. Don't
-hesitate to post questions to the <a href="http://www.gromacs.org/mailing_lists/users.php>mailing lists</a>
-if you have problems.<br><br>
-
-<li><A NAME="compiler">
-<B>How do I select the compiler and/or flags to use?</B><br><br>
-Select the compiler by setting the CC and/or F77 environment variables before running the
-GROMACS configure script (MPICC for the MPI C compiler). You can also set the corresponding compiler flags with CFLAGS and
-FFLAGS, and the linker flags with LDFLAGS. If you want to add a library at the link stage
-you can add the -llib flags to the LIBS variable, and include directories can be
-added in the CPPFLAGS variable.<br><br>
-
-<li><A NAME="fftw">
-<B>The configuration script complains about FFTW - how
-do I install it?</B><br><br>
-FFTW uses the same type of automatic configuration scripts as
-GROMACS, so it's easy to configure and compile. The default setup
-places libraries under /usr/local, but you can change it with
---prefix. One important difference is that GROMACS normally
-is compiled in single precision, while FFTW defaults to double precision.
-Configure and install FFTW with the command:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt> ./configure --enable-float<br>
- make<br>
- make install</TT>
-<td></td>
-</tr>
-</table>
-<br>
-If you want to compile GROMACS with parallel MPI support you should
-also add --enable-mpi to the FFTW configuration. Once the installation
-is ready we recommend that you also install a double-precision version
-of FFTW (nice to have) with:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt> make distclean<br>
- ./configure<br>
- make<br>
-make install</tt>
-<td></td>
-</tr>
-</table>
-<br>
-
-That's it. Consult <A HREF="http://www.fftw.org">www.fftw.org</A> if you have any problems.<br><br>
-
-<li><A NAME="fftwlocation">
-<B>I HAVE installed FFTW, but the configuration script
-still says it can't find it!</B><br><br>
-
-OK. The problem is most probably that your compiler doesn't look for the
-header files and/or libraries in the place where you installed them. It's easy to fix; you
-can just tell the GROMACS configure script to add those directories to the search paths.
-Specify the header file directory (e.g. /home/erik/fftw/include) as<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt> CPPFLAGS="-I/home/erik/fftw/include"</tt>
-<td></td>
-</tr>
-</table>
-<br>
-and the location of the libraries (e.g. /home/erik/fftw/lib) in<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt> LDFLAGS="-L/home/erik/fftw/lib"</tt>
-<td></td>
-</tr>
-</table>
-<br>
-and then run the GROMACS configuration script. Note that some compilers don't search
-/usr/local by default, so you might have to specify these paths even if you installed
-FFTW in the default place.<br><br>
-
-<li><A NAME="MPI">
-<B>How do I compile GROMACS for parallel runs?</B><br><br>
-On most systems you only have to add the option --enable-mpi to the configure
-script, and then compile GROMACS the normal way. For this to work you need to
-have MPI communication libraries install, and some kind of wrapper script like
-mpicc or mpcc to use when compiling MPI programs. MPI should always be present
-on supercomputers, and if you are running on workstations we recommend LAM MPI,
-<a href="http://www.lam-mpi.org">www.lam-mpi.org</a>. On Linux it's even simpler -
-just install the RPM packages we provide!<br><br>
-
-On most supercomputers you can only run MPI programs on dedicated nodes, so
-in this case you probably want all the analysis programs compiled without MPI first.
-Since you normally only need the mdrun program with MPI support you can type
-"make mdrun; make install-mdrun" instead, but remember to type "make distclean"
-if you have previously compiled GROMACS without MPI. It is also possible to put a suffix
-on the MPI-version programs, or just start MPI runs when an environment variable
-is set. Check the options to the configure script with "./configure --help".
-<br><br>
-
-<li><A NAME="noMPIwrapper">
-<B>When I enable MPI support for parallel runs, GROMACS
-looks for a special MPI wrapper script like 'mpicc', but we don't use that; is it possible to
- add the MPI library manually with -lmpi ?</B><br><br>
-Sure - no problem, but it might not be entirely obvious if you are new to autoconf scripts.
-Here's how to do it:<br><br>
-To use the MPI library we need the header files (mpi.h) with definitions, and the MPI libraries
-with the actual code (e.g. libmpi.a). If your system uses some special hardware it might
-also be necessary to link with more libraries - ask your system administrator if you have
-any problems. Start by location these headers and libraries on your system, and then add
-them to your environment variables before running the configure script:
-<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="90%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>
-setenv CPPFLAGS "-I/path/to/your/mpi/include"<br>
-setenv LDFLAGS "-L/path/to/your/mpi/lib"<br>
-setenv LIBS "-lmpi"<br>
-setenv MPICC "cc"</tt>
-<td></td>
-</tr>
-</table>
-<br>
-(This is valid for tcsh, for bash you should use export instead.) Note that these commands overwrite
-any previous assignments, so you must add
-all parts you want (you can use $VARIABLE to add the previous value of an environment variable).
-<br>
-Now you should be able to run ./configure --enable-mpi !
-<br><br>
-
-<li><A NAME="nomotif">
-<B>How do I turn off Motif?</b><br><br>
-Just use the flag --without-motif-libraries (or headers).
-If the configure script doesn't find both libraries and headers it will disable motif. This is useful when you have motif on the machine where you compile, but not on all machines you run on.<br><br>
-
-<li><A NAME="ldpath">
-<b>Everything compiles fine, but when I try to run a program
-it complains about not finding libXXXX.so.</b><br><br>
-GROMACS and/or the FFTW package can be compiled with shared libraries. In fact,
-it's the default setup in the Linux RPMs. This means we save space by not linking all the
-routines into each binary, but call the shared library at runtime. Of course, this requires that you can find the library at runtime. For the GROMACS distribution programs we hardcode the location of the GROMACS and FFTW libraries, but if you compile your own programs or move your libraries you must tell the system where to find them!
-Fortunately, this is quite easy to do. On Linux you can do it permanently for
-all users if you are root, by adding the search path to the file /etc/ld.so.conf. Alternatively, you can add it to the LD_LIBRARY_PATH environment variable:
-<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<TT>setenv LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/opt/lib"</TT>
-<td></td>
-</tr>
-</table>
-<br>
-(This is valid for tcsh, for bash you should use export instead.) Ask your
-local sysadm how to add it to your login file so it's done automatically each
-time you log on.
-<br><br>
-
-
-<li><A NAME="osx_zsh">
-<b>I get an error from the configure script on Mac OS X!</B><br><br>
-This is because OS X uses Z shell for /bin/sh. This will hopefully be
-fixed in a future release of automake (it is not caused by Gromacs), but in the
-meantime you can install bash (if you don't already have it) and
-use the command '/bin/bash ./configure' (Your bash location might be different from /bin/bash).
-<br><br>
-
-
-
-<li><A NAME="noclue">
-<B>It still won't compile and I haven't got a clue what the problem might be...</B><br><br>
-Oops. Sorry, but those things happen :-) It's usually quite easy to fix. One possible
-source of errors is the shared libraries we use to save space. You can try to disable
-them with the --disable-shared flag to the configure script. You can also ask questions on the mailing
-lists or contact us. BUT: Unless you attach copies of your configuration/make output and/or
-log files we can only guess what your problem might be!<br><br>
-
-
-<li><A NAME="relativespeed">
-<B>How fast is GROMACS compared to other programs?</B><br><br>
-GROMACS is fast, VERY fast. In fact, on all benchmarks we've tested it's
-3-10 times faster than any other program we've tried, many of which are
-commercial. On x86 hardware GROMACS really excels due to the assembly loops.
-Of course, speed isn't everything. There are cases where
-we don't support a certain algorithm that program X supports, and
-vice versa. For instance, our assembly loops are only available in single precision.
-In any case - show us a benchmark were some other program
-is faster and we'll be happy to implement that algorithm. <br><br>
-
-
-<li><A NAME="speed">
-<B>Is there any way I can make GROMACS run faster?</B><br><br>
-That depends on your setup. If you are using x86 processors you should definitely
-make sure that you compile GROMACS with assemblt loops, and that your OS
-supports SSE instructions if you are using Pentium III/IV processors. If you
-compiled GROMACS with assembly loops there will be a line in the logfile telling
-you which loops we are using.<br>
-On alpha hardware you might want to play around with enabling/disabling the
-software invsqrt, and the inner loop vectorization. Modern alpha chips have a
-fairly fast hardware sqrt, but they also seem to benefit even more from vectorizing the
-innerloops and using the vectorized invsqrt provided in GROMACS.<br>
-If you are using IBM hardware you should locate or download the MASS libraries
-(mathematical accelerated subsystem). If you provide the location of this library
-in the LDFLAGS environment variable GROMACS will automatically use fast vectorized
-inner loops on IBM.<br>
-On any system apart from Linux/x86 (where we use assembly innerloops) you should also
-try to use a fortran compiler for better performance, and if you run Linux/alpha
-you should use the Compaq compilers instead of gcc.<br><br>
-You should always use single precision; there are very few cases where you
-actually need double precision, and it's slower.<br><br>
-Investigate the options <TT>-dummy</TT> and <TT>-heavyh</TT> to
-<A HREF="/documentation/reference_3.1/online/pdb2gmx.html"><TT>pdb2gmx</TT></A>
-which control the constraining of hydrogen atoms and the mass of
-unconstrained hydrogen atoms. This eliminates the highest freqency
-motions in your system, enabling you to increase the timestep without
-loss of accuracy to about 4 fs, or even up to 7 fs with negligible
-loss of accuracy! (<I>J. Comput. Chem.</I> <B>20</B>,786).<br><br>
-If your system has relatively slow disk-IO, and/or you write
-frames and energies out very often, and/or you have a very large
-system the performance might be limited by disk access. In that case,
-you might consider writing fewer frames to your trajectories
-(<TT>.xtc</TT> and especially <TT>.trr</TT> or <TT>.trj</TT>) and
-energy file (<TT>.ene</TT> or <TT>.edr</TT>).<br><br>
-
-
-<li><A NAME="besthardware">
-<B>What hardware do you recommend?</B><br><br>
-If cost is an issue, you can't beat dual Pentium boxes due to the assembly loops!
-Dual AMD machines should also be a nice option soon. Note that the Pentium 4 processors
-achieve a high clock frequency by using a longer pipeline, so a Pentium 4 at 1.7 GHz is
-about the same speed as a Pentium III at 1.2 GHz. Don't be fooled by the high clock!<br><br>
-
-
-<li><A NAME="systemsize">
-<B>How large systems can I simulate with GROMACS?</B><br><br>
-It's only limited by your memory, and GROMACS is quite modest in its
-memory requirements. As an indictaion: a system of 12000 atoms takes
-about 10Mb of memory, and 6000 atoms about 5.5Mb (on a SGI O2),
-which comes down to just over 900 bytes memory use per atom in
-your system (your mileage will vary). Due to the fact that we initially
-developed GROMACS to run on our home-built parallel machine, with
-only 8Mb of memory per processor, the code is quite well optimized for
-memory use. To get an indication of scaling of GROMACS performance as
-a function of system size, have a look at the
-<A HREF="/benchmarks/scaling.php">scaling benchmark page</A>.<br><br>
-Note that the .gro format only support atoms numbers up to 99999, so
-it will loop once it reaches 100000 atoms. This is no problem in GROMACS
-since we don't use the atom number from the .gro file.<br><br>
-
-
-<li><A NAME="pdfgraphics">
-<B>Why is the front page graphics in the PDF manual strange?</b><br><br>
-This is a known problem with some versions of Acrobat reader on Linux. There
-is nothing we can do about it, but in the cases we've tested the graphics
-still prints fine on paper.<br><br>
-
-
-<li><A NAME="PDB">
-<B>OK, I've downloaded a PDB file with a structure I'd like
-to simulate. What should I do?</B><br><br>
-Look at the <A HREF="/documentation/reference_3.1/online/flow.html">flowchart</A> for a quick overview.
-Start where it says "eiwit.pdb" (this is somewhere at the top).
-More detailed info can be found in the
-<A HREF="/documentation/reference_3.1/online/getting_started.html">Getting Started</A>
-section, you can probably start where it says "Ribonuclease S-Peptide".
-<br><br>
-
-<li><A NAME="multi">
-<B>My protein has multiple subunits. Is that a problem?</B><br><br>
-<TT>pdb2gmx</TT> can automatically process multimeric proteins,
-but won't be able to make inter-subunit cystine bridges. A word of warning, though:
-the units will only be recognized as different chains if they
-have different chain identifiers!<br><br>
-
-<li><A NAME="convert">
-<B>How do I convert my structure from a .gro, .tpr, or trajectory
-file to a .pdb file?</B><br><br>
-Any <a href="/documentation/reference_3.1/online/files.html">generic structure</a> file,
-for instance <TT>.gro</TT>, <TT>.pdb</TT> or <TT>.tpr</TT>, can be
-converted to <TT>.pdb</TT> with
-<a href="/documentation/reference_3.1/online/editconf.html">editconf</a></TT>. You can view a
-<TT>.pdb</TT> file with several programs, for instance <TT>rasmol</TT>. Two generic
-structure files can be fitted with
-<a href="/documentation/reference_3.1/online/g_confrms.html">g_confrms</a></TT>, the two
-superimposed structures can be written to a <TT>.pdb</TT> file. Any
-<a href="/documentation/reference_3.1/online/files.html">generic trajectory</a> format can be
-converted with <a href="/documentation/reference_3.1/online/trjconv.html"><TT>trjconv</TT></a>.
-You can dump one frame with <TT>trjconv -dump</TT>, or write a
-<TT>.pdb</TT> with multiple frames using <TT>trjconv -op -app</TT>.
-If multiple structures in a <TT>.pdb</TT> are separated by
-<TT>ENDMDL</TT> keywords, you should use <TT>rasmol -nmrpdb</TT> to
-view them.<br><br>
-
-
-<li><A NAME="scmis">
-<B>The <TT>pdb2gmx</TT> program is complaining about long bonds and/or
-missing atoms. What should I do?</B><br><br>
-There are probably atoms missing earlier in the
-<A HREF="/documentation/reference_3.1/online/pdb.html"><TT>.pdb</TT></A> file
-which makes <A HREF="/documentation/reference_3.1/online/pdb2gmx.html"><TT>pdb2gmx</TT></A>
-go crazy. Check the screen output of <TT>pdb2gmx</TT>, as it
-will tell you which one is missing. Then add the atoms in your pdb file,
-energy minimization will put them in the right place,
-or fix the side chain with e.g. the
-<A HREF="http://swift.embl-heidelberg.de/whatif/">WhatIF program</A>.<br><br>
-
-<li><A NAME="osxcpp">
-<B>grompp doesn't find the C preprocessor /lib/cpp on OS X!</B><br><br>
-Yep, that's right. OS X is a real Unix system, but Apple have been moving
-some stuff around. Look for it in /usr/bin or possible a place like
-/usr/libexec/gcc/darwin/ppc/2.95.2/cpp. Since a lot of programs assume
-cpp to be present in /lib it is probably smart to make a link, but you
-can also specify the location with the cpp keyword in your mdp files.
-<br><br>
-
-<li><A NAME="1-4cut">
-<B>What does "1-4 (#,#) interaction not within cut-off" mean?</B><br><br>
-Some of your atoms have moved so two atoms separated by three bonds are
-separated by more than the cutoff distance. This is BAD.
-Most important: <b>do not increase your cut-off!</b> This error
-actually indicates that the atoms have very large velocities,
-which usually means that (part of) your molecule(s) is (are)
-exploding. If you are using LINCS for constraints, you probably also
-already got a number of LINCS warnings. When using SHAKE this will
-give rise to a SHAKE error, which halts your simulation before the
-"1-4 not within cutoff" error can appear. <br><br>
-There can be a number of reasons for the large velocities in
-your system. If it happens at the
-beginning of the simulation, your system might be not equilibrated
-well enough (e.g. it contains some bad contacts). Try a(nother) round
-of energy minimization to fix this. Otherwise you might have a very
-high temperature, and/or a too large
-timestep. Experiment with these parameters till the error stops
-occurring. If this doesn't help, check your topology!<br><br>
-
-<li><A NAME="libnet">
-<B>What does "Fatal error: Routine gmx_tx called in libnet.c" mean?</B><br><br>
-You probably made a parallel mdrun without typing make distclean. The error
-messages tells you that the GROMACS library you are using doesn't support parallel runs.<br><br>
-
-<li><A NAME="output">
-<B>My simulation seems to be running, but shouldn't there be any output?</B><br><br>
-
-
-<UL>
-<LI>Your simulation might simply be (very) slow, and since output is
-buffered, it can take quite some time for output to appear in the
-respective files. If you are trying to fix some problems and you want
-to get output as fast as possible, you can set the environment
-variable <TT>LOG_BUFS</TT> to 0 by using <TT>setenv LOG_BUFS 0</TT>,
-this disables output buffering. Use <TT>unsetenv LOG_BUFS</TT> to turn
-buffering on again.
-
-<LI>Something might be going wrong in your simulation, causing
-e.g. <tt>not-a-number</tt>s (<TT>NAN</TT>) to be generated (these are
-the result of e.g. division by zero). Subsequent calculations with
-<TT>NAN</TT>'s will generate floating point exceptions which slow
-everything down by orders of magnitude. On a SGI system this will
-usually result in a large percentage of CPU time being devoted to
-'system' (check it with <TT>osview</TT>, or for a multi-processor
-machine with <TT>top</TT> and <TT>osview</TT>).
-
-<LI>You might have all
-<A HREF="/documentation/reference_3.1/online/mdp_opt.html#out"><TT>nst*</TT></A> parameters (see
-your <A HREF="/documentation/reference_3.1/online/mdp_opt.html"><TT>.mdp</TT></A> file) set to 0,
-this will suppress most output.
-
-<LI>Your disk might be full. Eventually this will lead to
-<TT>mdrun</TT> crashing, but since output is buffered, it might take a
-while for <TT>mdrun</TT> to realize it can't write.
-<li>You are runnning an executable compiled with MPI support (e.g.
-<a href="http://www.lam-mpi.org">LAM</a>) and did not start the LAM daemon
-(lamboot). See LAM documentation.
-</UL>
-<br><br>
-
-<li><A NAME="temp">
-<b>Why do I get very strange temperatures in my simulation?</b><br><br>
-You probably have very close contacts or a too large time
-step. This causes inaccurate integration which will usually result in
-a large positive temperature drift. Try some more energy minimization
-to get rid of the close contacts, or if that still doesn't help, try a
-short equilibration run with a small(er) time step. <br><br>
-
-<li><A NAME="recover">
-<b>Is there any smart way to continue a run that crashed?</b><br><br>
-Yes, if the reason for the crash didn't have anything to doe with
-the algorithms, i.e. it was due to a system crash, a full disk, or
- a kill by the queuing system. Otherwise you'll have to use
-<TT><a href="/documentation/reference_3.1/online/grompp.html">grompp</a></TT>
-and change the options.
-
-To really continue a simulation as if nothing had happened, you will
-need coordinates and velocities in full precision (i.e.
-<TT><a href="/documentation/reference_3.1/online/trr.html">.trr</a></TT> format).
-<TT><a href="/documentation/reference_3.1/online/xtc.html">.xtc</a></TT> trajectories are in
-reduced precision (only 3 decimal places after the decimal point) and
-do not contain velocity information at all. Feed this trajectory and
-your origional <TT><a href="/documentation/reference_3.1/online/tpr.html">.tpr</a></TT> file to
-<TT><a href="/documentation/reference_3.1/online/tpbconv.html">tpbconv</a></TT> to obtain a new
-<TT><a href="/documentation/reference_3.1/online/tpr.html">.tpr</a></TT> file, <EM>be sure</EM>
-to specify the one-but-last frame from your
-<TT><a href="/documentation/reference_3.1/online/trr.html">.trr</a></TT> file, since the very
-last frame is likely to be corrupted due to the crash. With the
-<TT><a href="/documentation/reference_3.1/online/tpr.html">.tpr</a></TT> file
-<TT><a href="/documentation/reference_3.1/online/tpbconv.html">tpbconv</a></TT> produces you can
-restart your simulation.<br><br>
-
-After the continuation run is finished, you will have your simulation
-split up in separate files, which you will probably want to combine.
-This can be done as follows (the same command works for xtc-files):<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<TT>trjcat -o whole.trr part1.trr part2.trr part3.trr</TT>
-<td></td>
-</tr>
-</table>
-<br>
-The energy files can be concatenated in a similar manner:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<TT>eneconv -o whole.edr part1.edr part2.edr part3.edr</TT>
-<td></td>
-</tr>
-</table>
-<br>
-Since tpbconv sets the time in the continuation runs the files are
-automatically sorted and overlapping frames removed. If you have a mix of
-runs continued with tpbconv and grompp you might have to set the times yourself
-(see the manual pages for details).
-<br><br>
-
-It is of course possible to start a simulation from the coordinates in
-your <a href="/documentation/reference_3.1/online/xtc.html">xtc</a> file, but in that case new
-velocities will have to be generated resulting in a 'kink' in the
-simulation. To prevent this you should write coordinates and
-velocities to a <TT><a href="/documentation/reference_3.1/online/trr.html">.trr</a></TT> file
-during your simulations. Do this by setting
-<A HREF=/documentation/reference_3.1/online/mdp_opt.html#out><TT>nstxout</TT></a> and
-<A HREF=/documentation/reference_3.1/online/mdp_opt.html#out><TT>nstvout</TT></a> in your
-<TT><a href="/documentation/reference_3.1/online/mdp.html">.mdp</a></TT> file. You don't need
-these frames very often (every 10 ps or so), but remember that when
-<TT><a href="/documentation/reference_3.1/online/mdrun.html">mdrun</a></TT> crashes, everything
-calculated after the last frame in the <TT><a
-href="/documentation/reference_3.1/online/trr.html">.trr</a></TT> file, will have to be
-recalculated for a proper continuation.<br><br>
-
-<li><A NAME="largefiles">
-<b>When my trajectory files reach 2GB I get strange error messages,
-or they just disappear. Why?</b><br><br>
-This is a problem with the file system; when the system, or the C library,
-or the compiler, or the NFS implementation (version 2) only uses 32 bits
-for the file pointer you cannot use files larger than 2GB. On most modern
-systems there are special compiler flags you can set to enable 64-bit file
-pointers, but since the autoconf test for this doesn't work we have chosen
-not to include any flags by default, since it can break other things. But
-you can of course try anything if you add your own flags :-)
-<br>
-In any case, it is probably a good idea to try to keep your files
-smaller than 2GB. You never know if you later might need to use it over
-NFS version 2 or on some supercomputer system that doesn't support large
-files yet.<br><br>
-
-
-<li><A NAME="multPDB">
-<b>How do I analyze a PDB file with multiple entries?</b><br><br>
-Assuming your
-<A HREF="/documentation/reference_3.1/online/pdb.html"><TT>.pdb</TT></A> file is
-called "<TT>eiwit.pdb</TT>", this is what you would do:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<TT>pdb2gmx -f eiwit.pdb -reth -ter -n</TT>
-<td></td>
-</tr>
-</table>
-<br>
-
-<TT>-reth</TT> lets
-<A HREF="/documentation/reference_3.1/online/pdb2gmx.html"><TT>pdb2gmx</TT></A> keep all
-hydrogens which are present in your input file. It will also <B>not
-add</B> any missing hydrogens, so your molecules should be
-complete. <TT>-ter</TT> will cause <TT>pdb2gmx</TT> to ask for termini
-types for which you must select 'none' for both C- and N-terminus.
-<TT>-n</TT> tells <TT>pdb2gmx</TT> to generate a
-<A HREF="/documentation/reference_3.1/online/ndx.html"><TT>.ndx</TT></A> file with the atoms
-reordered to the GROMACS standard. <TT>pdb2gmx</TT> now generates a
-topology file (<A HREF="/documentation/reference_3.1/online/top.html"><TT>topol.top</TT></A>)
-which exactly corresponds with the molecule(s) in your input file.
-It also writes a coordinate file
-(<A HREF="/documentation/reference_3.1/online/gro.html"><TT>conf.gro</TT></A>).<br><br>
-
-<P>
-The next step is:
-<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>trjconv -f eiwit.pdb -o eiwit.xtc -n clean -timestep 1 -box 10 -center</tt>
-<td></td>
-</tr>
-</table>
-<br>
-
-Yes, <TT>-f eiwit.pdb</TT> works because a <TT>.pdb</TT> is also a
-<A HREF="/documentation/reference_3.1/online/files.html">trajectory format</A>
-in GROMACS. <TT>-ox</TT> sets output to
-<A HREF="/documentation/reference_3.1/online/xtc.html"><TT>.xtc</TT></A>. <TT>-n clean</TT>
-tells <A HREF="/documentation/reference_3.1/online/trjconv.html"><TT>trjconv</TT></A> to
-use the <TT>clean.ndx</TT> generated by
-<A HREF="/documentation/reference_3.1/online/pdb2gmx.html"><TT>pdb2gmx</TT></A>, so the atom
-ordering in the output (<TT>.xtc</TT>) file will be according to GROMACS
-standards. <TT>-timestep 1</TT> sets the timestep between output frames
-to one, so the structures from the <TT>.pdb</TT> file get numbered
-sequentially.
-<TT>-ter</TT> causes <TT>TER</TT> markers in the <TT>.pdb</TT>
-file to be seen as end-of-frame, default <TT>ENDMDL</TT> is used. If you
-are not sure what is in your <TT>eiwit.pdb</TT>, <TT>TER</TT> is a good
-guess, but you should check. If you have <TT>ENDMDL</TT> in stead of
-<TT>TER</TT>, omit the <TT>-ter</TT>. <TT>-box 10</TT> sets a default
-box-size in the output <TT>.xtc</TT> trajectory (since no box is stored
-in a <TT>.pdb</TT> file). The size is in nm and should be larger than
-your molecule size. <TT>-center</TT> resets the geometrical center of
-each of your structures to the center of the box (the one you specify
-with <TT>-box</TT>). <TT>trjconv</TT> will generate a <TT>.xtc</TT>
-trajectory file with all the coordinates from your <TT>eiwit.pdb</TT>.
-
-<P>
-A not very exiting but mandatory step is:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>
-grompp -f grompp.mdp -c conf.gro -p topol.top</tt>
-<td></td>
-</tr>
-</table>
-<br>
-
-This will generate a run input file
-(<A HREF="/documentation/reference_3.1/online/tpr.html"><TT>topol.tpr</TT></A>) from the
-<TT>topol.top</TT> and <TT>conf.gro</TT> you generated with
-<TT>pdb2gmx</TT>.
-A <A HREF="/documentation/reference_3.1/online/mdp.html">default <TT>grompp.mdp</TT></A> is
-available. You can probably use it 'as is', but you might want or need
-to modify some thing. In any case you are encouraged to
-<A HREF="/documentation/reference_3.1/online/mdp_opt.html">review the description</A> of the
-numerous options in the <TT>.mdp</TT> file.
-
-<P>
-Now, suppose you want to calculate all cross-rmsd values for all
-structures. Enter:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>
-g_rms -f eiwit.xtc -s topol.tpr -m</tt>
-<td></td>
-</tr>
-</table>
-<br>
-
-<TT>-f eiwit.xtc</TT> and <TT>-s topol.tpr</TT> are self-explanatory.
-<TT>-m</TT> tells <A HREF="/documentation/reference_3.1/online/g_rms.html"><TT>g_rms</TT></A>
-to output an RMSD matrix in
-<A HREF="/documentation/reference_3.1/online/xpm.html"><TT>.xpm</TT></A> format, which can be
-directly viewed with for example <TT>xv</TT>.
-
-<P>
-Of course there are many more analysis tools available. For example
-<A HREF="/documentation/reference_3.1/online/ngmx.html"><TT>ngmx</TT></A> a trajectory viewer.
-A list of all tools is available in the <A HREF="/documentation/reference_3.1/online.html">online
-manual</A>.
-
-<P>
-
-<li><A NAME="twostruc">
-<b>Can I fit two structures which do not have the
-same number/sequence of atoms?</b><br><br>
-Yes, just type:<br><br>
-<table BORDER=0 CELLSPACING=0 CELLPADDING=8 COLS=3 WIDTH="100%" NOSAVE >
-<tr NOSAVE>
-<td WIDTH="2%" NOSAVE><font color="#000000"></font></td>
-<td WIDTH="98%" BGCOLOR="#000066" NOSAVE><font color="#FFFFFF">
-<tt>g_confrms -f1 file1.xxx -f2 file2.xxx</tt>
-<td></td>
-</tr>
-</table>
-<br>
-g_confrms accepts
-any <A HREF="/documentation/reference_3.1/online/files.html">generic structure format</A> which
-for instance can be <TT>.pdb</TT>, <TT>.gro</TT> or <TT>.tpr</TT>.
-The program will ask you to select subgroups of both structures for the
-(non mass weighted) LSQ fit. These subgroups must have the same number of atoms, however the two
-structures do <B>not</B> need to have the same number of atoms, i.e. two proteins
-with the same number of residues but not the same type of residues can be
-fitted on c-alpha's. You will be warned when the atomnames in the fit groups
-do not match, but the program will go on.
-Option <TT>-o</TT> gives a <TT>.gro</TT> file of the second structure fitted
-one the first.
-Option <TT>-op</TT> gives a <TT>.gro</TT> file of the two structures fitted
-on top of each other.
-</P>
-
-<li><A NAME="group">
-<b>I get tired of having to select the same index group
-over and over again. Is there a better way to do it?</b><br><br>
-Use <A HREF="/documentation/reference_3.1/online/make_ndx.html"><TT>make_ndx</TT></A> to create
-an <A HREF="/documentation/reference_3.1/online/ndx.html"><TT>.ndx</TT></A> file with only one
-group in it, this is done by typing '<TT>keep #</TT>' in <TT>make_ndx</TT>,
-where '<TT>#</TT>' stands for the one group you want to have.
-Name the file <TT>index.ndx</TT> (which is the default
-filename for index files) and specify the option <TT>-n</TT> with your
-favorite GROMACS analysis tool. Now this single group will get selected
-automatically every time an index group is needed.
-<P>
-
-<li><A NAME="diys">
-<b>How do I perform an analysis that GROMACS doesn't have a program
-for?</b><br><br>
-We've created a small well-commented template analysis program for you; look in
-/usr/local/gromacs/share/template (or wherever you have gromacs installed.)
-This program reads a topology and trajectory and shows you how to access
-both coordinates and topology information like atom masses and charges.
-There is also a proper GROMACS Makefile in this directory, so if you
-copy the entire contents of the directory you should be able to type
-"make template" to build the program. It's easy to add more programs or
-change the name by editing the Makefile.<br><br>
-
-Now, if you wrote an analysis tool which, in your opinion, adds
-something that is really missing in GROMACS, please post it on the
-<a href="http://www.gromacs.org/mailing_lists/developers.php">developers
-mailing list</a> so that all other GROMACS users can also benefit from it!
-<br><br>
-
-<li><A NAME="none"> <b>My problem isn't mentioned above, and/or none
-of the solutions seem to work?</b> <br><br> Check the installation
-instructions carefully if your problem is related to the
-configuration, building and installation of GROMACS. Also try <A
-HREF="/documentation/reference_3.1/online/getting_started.html">"Getting
-Started"</A> where a guided tour of GROMACS is provided. A quick
-glance at the <A
-HREF="/documentation/reference_3.1/online/flow.html">flowchart</A> will
-tell you if you missed any essential steps in setting up a run.
-Checking your <A
-HREF="/documentation/reference_3.1/online/mdp.html"><TT>.mdp</TT></A>
-file against our <A
-HREF="/documentation/reference_3.1/online/mdp.html">sample
-<TT>.mdp</TT> file</A> and the <A
-HREF="/documentation/reference_3.1/online/mdp_opt.html">mdp options
-list</A> might solve a number of potential problems. In general it
-never hurts to read the <A
-HREF="/documentation/reference_3.1/online.html">manual pages</A> of all
-the GROMACS programs you (tried to) use. If all this still leaves you
-with any unanswered questions, please post your question to the <a
-href="/mailing_lists/users.php">gmx-users mailing list</a>! <br><br>
-</ul>
-
-
-<hr>
-<div ALIGN=RIGHT>
-<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
-</div>
-
-</BODY>
More info can be found in the
<A HREF="flow.html">flowchart</A>
(for a quick overview) and the
-<A HREF="../gmxfaq.html">GMX FAQ</A>.
+<A HREF="http://www.gromacs.org/Documentation/FAQs">GROMACS FAQs</A>.
</P>
<br><hr><br>
More info can be found in the
<A HREF="flow.html">flowchart</A>
(for a quick overview) and the
-<A HREF="../gmxfaq.html">GROMACS FAQ (Frequently asked questions)</A>.
+<A HREF="http://www.gromacs.org/Documentation/FAQs">GROMACS FAQs (Frequently asked questions)</A>.
</P>
<br><hr><br>
<TD WIDTH=116>
<a href="http://www.gromacs.org/"><img SRC="../images/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td>
<td ALIGN=LEFT VALIGN=TOP WIDTH=280><br><h2>mdp options</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
-</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p> </p><B>VERSION 4.6.3</B></td></tr></TABLE>
+</TABLE></TD><TD WIDTH="*" ALIGN=RIGHT VALIGN=BOTTOM><p> </p><B>VERSION 4.6.4</B></td></tr></TABLE>
<HR>
<!--
#cmakedefine GMX_FFT_ACML
/* Target platform is x86 or x86_64 */
-#cmakedefine GMX_IS_X86
+#cmakedefine GMX_TARGET_X86
/* Target platform is BlueGene/Q */
-#cmakedefine GMX_IS_BGQ
+#cmakedefine GMX_TARGET_BGQ
/* SSE2 instructions available */
#cmakedefine GMX_X86_SSE2
/* For convenience, and to enable configure-time invocation, we keep all architectures
* in a single file, but to avoid repeated ifdefs we set the overall architecture here.
*/
-#ifdef GMX_IS_X86
+#ifdef GMX_TARGET_X86
/* OK, it is x86, but can we execute cpuid? */
#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
# define GMX_CPUID_X86
int
gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid,
- FILE * log)
+ FILE * log,
+ int print_to_stderr)
{
int rc;
char str[1024];
gmx_cpuid_acceleration_string[acc],
gmx_cpuid_acceleration_string[compiled_acc]);
}
- printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n",
- gmx_cpuid_acceleration_string[compiled_acc],
- gmx_cpuid_acceleration_string[acc]);
+ if (print_to_stderr)
+ {
+ fprintf(stderr, "Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n",
+ gmx_cpuid_acceleration_string[compiled_acc],
+ gmx_cpuid_acceleration_string[acc]);
+ }
}
return rc;
}
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
*
- * This file is part of GROMACS.
- * Copyright (c) 2012-
- *
- * Written by the Gromacs development team under coordination of
- * David van der Spoel, Berk Hess, and Erik Lindahl.
- *
- * This library is free software; you can redistribute it and/or
+ * GROMACS is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * And Hey:
- * GROup of MAchos and Cynical Suckers
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#include "thread_mpi/threads.h"
-#ifdef HAVE_UNISTD_H
-#include <unistd.h>
-#endif
#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
#include "windows.h"
#endif
-/* Although we can't have more than 10 GPU different ID-s passed by the user as
- * the id-s are assumed to be represented by single digits, as multiple
- * processes can share a GPU, we can end up with more than 10 IDs.
- * To account for potential extreme cases we'll set the limit to a pretty
- * ridiculous number. */
-static unsigned int max_gpu_ids_user = 64;
+#ifdef GMX_GPU
+const gmx_bool bGPUBinary = TRUE;
+#else
+const gmx_bool bGPUBinary = FALSE;
+#endif
static const char * invalid_gpuid_hint =
"A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
/* FW decl. */
-static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int count);
+static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt);
-static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
+static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll)
{
int i, ndev;
char stmp[STRLEN];
char onhost[266], stmp[STRLEN];
int ngpu;
+ if (!gpu_info->bDetectGPUs)
+ {
+ /* We skipped the detection, so don't print detection stats */
+ return;
+ }
+
ngpu = gpu_info->ncuda_dev;
#if defined GMX_MPI && !defined GMX_THREAD_MPI
if (ngpu > 0)
{
- sprint_gpus(stmp, gpu_info);
+ sprint_gpus(stmp, gpu_info, TRUE);
md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
}
static void print_gpu_use_stats(FILE *fplog,
const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
const t_commrec *cr)
{
char sbuf[STRLEN], stmp[STRLEN];
- int i, ngpu, ngpu_all;
+ int i, ngpu_comp, ngpu_use;
- ngpu = gpu_info->ncuda_dev_use;
- ngpu_all = gpu_info->ncuda_dev;
+ ngpu_comp = gpu_info->ncuda_dev_compatible;
+ ngpu_use = gpu_opt->ncuda_dev_use;
- /* Issue note if GPUs are available but not used */
- if (ngpu_all > 0 && ngpu < 1)
+ /* Issue a note if GPUs are available but not used */
+ if (ngpu_comp > 0 && ngpu_use < 1)
{
sprintf(sbuf,
"%d compatible GPU%s detected in the system, but none will be used.\n"
"Consider trying GPU acceleration with the Verlet scheme!",
- ngpu_all, (ngpu_all > 1) ? "s" : "");
+ ngpu_comp, (ngpu_comp > 1) ? "s" : "");
}
else
{
- sprintf(sbuf, "%d GPU%s %sselected for this run: ",
- ngpu, (ngpu > 1) ? "s" : "",
- gpu_info->bUserSet ? "user-" : "auto-");
- for (i = 0; i < ngpu; i++)
+ int ngpu_use_uniq;
+
+ ngpu_use_uniq = gmx_count_gpu_dev_unique(gpu_info, gpu_opt);
+
+ sprintf(sbuf, "%d GPU%s %sselected for this run.\n"
+ "Mapping of GPU%s to the %d PP rank%s in this node: ",
+ ngpu_use_uniq, (ngpu_use_uniq > 1) ? "s" : "",
+ gpu_opt->bUserSet ? "user-" : "auto-",
+ (ngpu_use > 1) ? "s" : "",
+ cr->nrank_pp_intranode,
+ (cr->nrank_pp_intranode > 1) ? "s" : "");
+
+ for (i = 0; i < ngpu_use; i++)
{
- sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i));
- if (i < ngpu - 1)
+ sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, gpu_opt, i));
+ if (i < ngpu_use - 1)
{
strcat(stmp, ", ");
}
/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
* to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment. */
-static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
+static void parse_gpu_id_plain_string(const char *idstr, int *nid, int **idlist)
{
- int i;
- size_t len_idstr;
+ int i;
- len_idstr = strlen(idstr);
+ *nid = strlen(idstr);
- if (len_idstr > max_gpu_ids_user)
- {
- gmx_fatal(FARGS, "%d GPU IDs provided, but only at most %d are supported",
- len_idstr, max_gpu_ids_user);
- }
-
- *nid = len_idstr;
+ snew(*idlist, *nid);
for (i = 0; i < *nid; i++)
{
gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
idstr[i], invalid_gpuid_hint);
}
- idlist[i] = idstr[i] - '0';
+ (*idlist)[i] = idstr[i] - '0';
}
}
-void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr, int ntmpi_requested,
- gmx_bool bUseGPU)
+static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist)
{
- int npppn, ntmpi_pp, ngpu;
- char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
- char gpu_plural[2];
- gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
- int ret;
- static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
+    /* XXX implement CSV format to support more than 10 different GPUs in a box. */
+ gmx_incons("Not implemented yet");
+}
+void gmx_check_hw_runconf_consistency(FILE *fplog,
+ const gmx_hw_info_t *hwinfo,
+ const t_commrec *cr,
+ const gmx_hw_opt_t *hw_opt,
+ gmx_bool bUseGPU)
+{
+ int npppn, ntmpi_pp;
+ char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
+ gmx_bool btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
assert(hwinfo);
assert(cr);
return;
}
- /* We run this function only once, but must make sure that all threads
- that are alive run this function, so they get consistent data. We
- achieve this by mutual exclusion and returning if the structure is
- already properly checked & set */
- ret = tMPI_Thread_mutex_lock(&cons_lock);
- if (ret != 0)
+ btMPI = bMPI = FALSE;
+ bNthreadsAuto = FALSE;
+#if defined(GMX_THREAD_MPI)
+ btMPI = TRUE;
+ bNthreadsAuto = (hw_opt->nthreads_tmpi < 1);
+#elif defined(GMX_LIB_MPI)
+ bMPI = TRUE;
+#endif
+
+    /* GPU emulation detection is done later, but we need it here as well
+ * -- uncool, but there's no elegant workaround */
+ bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
+ bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+
+ /* check the acceleration mdrun is compiled with against hardware
+ capabilities */
+ /* TODO: Here we assume homogeneous hardware which is not necessarily
+ the case! Might not hurt to add an extra check over MPI. */
+ gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr));
+
+ /* NOTE: this print is only for and on one physical node */
+ print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+
+ if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
{
- gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
+ /* NOTE: this print is only for and on one physical node */
+ print_gpu_use_stats(fplog, &hwinfo->gpu_info, &hw_opt->gpu_opt, cr);
}
- if (!hwinfo->bConsistencyChecked)
+ /* Need to ensure that we have enough GPUs:
+ * - need one GPU per PP node
+ * - no GPU oversubscription with tMPI
+ * */
+ /* number of PP processes per node */
+ npppn = cr->nrank_pp_intranode;
+
+ pernode[0] = '\0';
+ th_or_proc_plural[0] = '\0';
+ if (btMPI)
{
- btMPI = bMPI = FALSE;
- bNthreadsAuto = FALSE;
-#if defined(GMX_THREAD_MPI)
- btMPI = TRUE;
- bNthreadsAuto = (ntmpi_requested < 1);
-#elif defined(GMX_LIB_MPI)
- bMPI = TRUE;
-#endif
+ sprintf(th_or_proc, "thread-MPI thread");
+ if (npppn > 1)
+ {
+ sprintf(th_or_proc_plural, "s");
+ }
+ }
+ else if (bMPI)
+ {
+ sprintf(th_or_proc, "MPI process");
+ if (npppn > 1)
+ {
+ sprintf(th_or_proc_plural, "es");
+ }
+ sprintf(pernode, " per node");
+ }
+ else
+ {
+ /* neither MPI nor tMPI */
+ sprintf(th_or_proc, "process");
+ }
-#ifdef GMX_GPU
- bGPUBin = TRUE;
-#else
- bGPUBin = FALSE;
-#endif
+ if (bUseGPU && hwinfo->gpu_info.ncuda_dev_compatible > 0 &&
+ !bEmulateGPU)
+ {
+ int ngpu_comp, ngpu_use;
+ char gpu_comp_plural[2], gpu_use_plural[2];
+
+ ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
+ ngpu_use = hw_opt->gpu_opt.ncuda_dev_use;
+
+        sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
+ sprintf(gpu_use_plural, "%s", (ngpu_use > 1) ? "s" : "");
- /* GPU emulation detection is done later, but we need here as well
- * -- uncool, but there's no elegant workaround */
- bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
- bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
-
- /* check the acceleration mdrun is compiled with against hardware
- capabilities */
- /* TODO: Here we assume homogeneous hardware which is not necessarily
- the case! Might not hurt to add an extra check over MPI. */
- gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
-
- /* Need to ensure that we have enough GPUs:
- * - need one GPU per PP node
- * - no GPU oversubscription with tMPI
- * => keep on the GPU support, otherwise turn off (or bail if forced)
- * */
- /* number of PP processes per node */
- npppn = cr->nrank_pp_intranode;
-
- pernode[0] = '\0';
- th_or_proc_plural[0] = '\0';
- if (btMPI)
+ /* number of tMPI threads auto-adjusted */
+ if (btMPI && bNthreadsAuto)
{
- sprintf(th_or_proc, "thread-MPI thread");
- if (npppn > 1)
+ if (hw_opt->gpu_opt.bUserSet && npppn < ngpu_use)
{
- sprintf(th_or_proc_plural, "s");
+ /* The user manually provided more GPUs than threads we
+ could automatically start. */
+ gmx_fatal(FARGS,
+                      "%d GPU%s provided, but only %d PP thread-MPI thread%s could be started.\n"
+                      "%s requires one PP thread-MPI thread per GPU; use fewer GPUs%s.",
+ ngpu_use, gpu_use_plural,
+ npppn, th_or_proc_plural,
+ ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
}
- }
- else if (bMPI)
- {
- sprintf(th_or_proc, "MPI process");
- if (npppn > 1)
+
+ if (!hw_opt->gpu_opt.bUserSet && npppn < ngpu_comp)
{
- sprintf(th_or_proc_plural, "es");
+ /* There are more GPUs than tMPI threads; we have
+               limited the number of GPUs used. */
+ md_print_warn(cr, fplog,
+ "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+                          "      %s can use one GPU per PP thread-MPI thread, so only %d GPU%s will be used.%s\n",
+ ngpu_comp, gpu_comp_plural,
+ npppn, th_or_proc_plural,
+ ShortProgram(), npppn,
+ npppn > 1 ? "s" : "",
+ bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
}
- sprintf(pernode, " per node");
- }
- else
- {
- /* neither MPI nor tMPI */
- sprintf(th_or_proc, "process");
}
- if (bGPUBin)
+ if (hw_opt->gpu_opt.bUserSet)
{
- print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+ if (ngpu_use != npppn)
+ {
+ gmx_fatal(FARGS,
+ "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+ "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+ th_or_proc, btMPI ? "s" : "es", pernode,
+ ShortProgram(), npppn, th_or_proc,
+ th_or_proc_plural, pernode,
+ ngpu_use, gpu_use_plural);
+ }
}
-
- if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
+ else
{
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-
- /* number of tMPI threads atuo-adjusted */
- if (btMPI && bNthreadsAuto)
+ if (ngpu_comp > npppn)
{
- if (npppn < ngpu)
- {
- if (hwinfo->gpu_info.bUserSet)
- {
- /* The user manually provided more GPUs than threads we
- could automatically start. */
- gmx_fatal(FARGS,
- "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
- "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
- ngpu, gpu_plural, npppn, th_or_proc_plural,
- ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
- }
- else
- {
- /* There are more GPUs than tMPI threads; we have to
- limit the number GPUs used. */
- md_print_warn(cr, fplog,
- "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
- " %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
- ngpu, gpu_plural, npppn,
- th_or_proc_plural,
- ShortProgram(), npppn,
- npppn > 1 ? "s" : "",
- bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
- if (cr->rank_pp_intranode == 0)
- {
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
- }
- }
- }
+ md_print_warn(cr, fplog,
+ "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
+ " PP %s%s%s than GPU%s available.\n"
+ " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
+ ShortProgram(), th_or_proc,
+ th_or_proc_plural, pernode, gpu_comp_plural,
+ th_or_proc, npppn, gpu_use_plural, pernode);
}
- if (ngpu != npppn)
+ if (ngpu_use != npppn)
{
- if (hwinfo->gpu_info.bUserSet)
+ /* Avoid duplicate error messages.
+ * Unfortunately we can only do this at the physical node
+ * level, since the hardware setup and MPI process count
+ * might differ between physical nodes.
+ */
+ if (cr->rank_pp_intranode == 0)
{
gmx_fatal(FARGS,
"Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+ "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
th_or_proc, btMPI ? "s" : "es", pernode,
ShortProgram(), npppn, th_or_proc,
- th_or_proc_plural, pernode, ngpu, gpu_plural);
- }
- else
- {
- if (ngpu > npppn)
- {
- md_print_warn(cr, fplog,
- "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
- " PP %s%s%s than GPU%s available.\n"
- " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
- ShortProgram(), th_or_proc,
- th_or_proc_plural, pernode, gpu_plural,
- th_or_proc, npppn, gpu_plural, pernode);
-
- if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
- {
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
- }
- }
- else
- {
- /* Avoid duplicate error messages.
- * Unfortunately we can only do this at the physical node
- * level, since the hardware setup and MPI process count
- * might be differ over physical nodes.
- */
- if (cr->rank_pp_intranode == 0)
- {
- gmx_fatal(FARGS,
- "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
- th_or_proc, btMPI ? "s" : "es", pernode,
- ShortProgram(), npppn, th_or_proc,
- th_or_proc_plural, pernode, ngpu,
- gpu_plural);
- }
- }
+ th_or_proc_plural, pernode,
+ ngpu_use, gpu_use_plural);
}
}
+ }
- {
- int same_count;
+ {
+ int same_count;
- same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
+ same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
- if (btMPI && same_count > 0)
- {
- gmx_fatal(FARGS,
- "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
- "Use MPI if you are sure that you want to assign GPU to multiple threads.");
- }
-
- if (same_count > 0)
- {
- md_print_warn(cr, fplog,
- "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
- " multiple %s%s; this should be avoided as it can cause\n"
- " performance loss.\n",
- same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
- }
+ if (same_count > 0)
+ {
+ md_print_info(cr, fplog,
+ "NOTE: You assigned %s to multiple %s%s.\n",
+ same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
}
- print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
}
- hwinfo->bConsistencyChecked = TRUE;
- }
-
- ret = tMPI_Thread_mutex_unlock(&cons_lock);
- if (ret != 0)
- {
- gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
}
#ifdef GMX_MPI
}
-int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
+/* Return 0 if none of the GPU (per node) are shared among PP ranks.
+ *
+ * Sharing GPUs among multiple PP ranks is possible when the user passes
+ * GPU IDs. Here we check for sharing and return a non-zero value when
+ * this is detected. Note that the return value represents the number of
+ * PP rank pairs that share a device.
+ */
+int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt)
{
int same_count = 0;
- int ngpu = gpu_info->ncuda_dev_use;
+ int ngpu = gpu_opt->ncuda_dev_use;
- if (gpu_info->bUserSet)
+ if (gpu_opt->bUserSet)
{
int i, j;
{
for (j = i + 1; j < ngpu; j++)
{
- same_count += (gpu_info->cuda_dev_use[i] ==
- gpu_info->cuda_dev_use[j]);
+ same_count += (gpu_opt->cuda_dev_use[i] ==
+ gpu_opt->cuda_dev_use[j]);
}
}
}
return same_count;
}
+/* Count and return the number of unique GPUs (per node) selected.
+ *
+ * As sharing GPUs among multiple PP ranks is possible when the user passes
+ * GPU IDs, the number of GPUs used (per node) can be different from the
+ * number of GPU IDs selected.
+ */
+static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt)
+{
+ int i, uniq_count, ngpu;
+ int *uniq_ids;
+
+ assert(gpu_info);
+ assert(gpu_opt);
+
+ ngpu = gpu_info->ncuda_dev;
+ uniq_count = 0;
+
+ snew(uniq_ids, ngpu);
+
+ /* Each element in uniq_ids will be set to 0 or 1. The n-th element set
+ * to 1 indicates that the respective GPU was selected to be used. */
+ for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+ {
+ uniq_ids[get_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+ }
+ /* Count the devices used. */
+ for (i = 0; i < ngpu; i++)
+ {
+ uniq_count += uniq_ids[i];
+ }
+
+ sfree(uniq_ids);
+
+ return uniq_count;
+}
+
/* Return the number of hardware threads supported by the current CPU.
* We assume that this is equal with the number of CPUs reported to be
* online by the OS at the time of the call.
*/
-static int get_nthreads_hw_avail(FILE gmx_unused *fplog, const t_commrec gmx_unused *cr)
+static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr)
{
int ret = 0;
return ret;
}
+static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr,
+ gmx_gpu_info_t *gpu_info)
+{
+#ifdef GMX_LIB_MPI
+ int rank_world;
+ MPI_Comm physicalnode_comm;
+#endif
+ int rank_local;
+
+ /* Under certain circumstances MPI ranks on the same physical node
+ * can not simultaneously access the same GPU(s). Therefore we run
+ * the detection only on one MPI rank per node and broadcast the info.
+ * Note that with thread-MPI only a single thread runs this code.
+ *
+ * TODO: We should also do CPU hardware detection only once on each
+ * physical node and broadcast it, instead of doing it on every MPI rank.
+ */
+#ifdef GMX_LIB_MPI
+ /* A split of MPI_COMM_WORLD over physical nodes is only required here,
+ * so we create and destroy it locally.
+ */
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
+ MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
+ rank_world, &physicalnode_comm);
+ MPI_Comm_rank(physicalnode_comm, &rank_local);
+#else
+ /* Here there should be only one process, check this */
+ assert(cr->nnodes == 1 && cr->sim_nodeid == 0);
+
+ rank_local = 0;
+#endif
+
+ if (rank_local == 0)
+ {
+ char detection_error[STRLEN], sbuf[STRLEN];
+
+ if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+ {
+ if (detection_error != NULL && detection_error[0] != '\0')
+ {
+ sprintf(sbuf, ":\n %s\n", detection_error);
+ }
+ else
+ {
+ sprintf(sbuf, ".");
+ }
+ md_print_warn(cr, fplog,
+ "NOTE: Error occurred during GPU detection%s"
+ " Can not use GPU acceleration, will fall back to CPU kernels.\n",
+ sbuf);
+ }
+ }
+
+#ifdef GMX_LIB_MPI
+ /* Broadcast the GPU info to the other ranks within this node */
+ MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev, 1, MPI_INT, 0, physicalnode_comm);
+
+ if (hwinfo_g->gpu_info.ncuda_dev > 0)
+ {
+ int cuda_dev_size;
+
+ cuda_dev_size = hwinfo_g->gpu_info.ncuda_dev*sizeof_cuda_dev_info();
+
+ if (rank_local > 0)
+ {
+ hwinfo_g->gpu_info.cuda_dev =
+ (cuda_dev_info_ptr_t)malloc(cuda_dev_size);
+ }
+ MPI_Bcast(hwinfo_g->gpu_info.cuda_dev, cuda_dev_size, MPI_BYTE,
+ 0, physicalnode_comm);
+ MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev_compatible, 1, MPI_INT,
+ 0, physicalnode_comm);
+ }
+
+ MPI_Comm_free(&physicalnode_comm);
+#endif
+}
+
gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id)
+ gmx_bool bDetectGPUs)
{
- int i;
- const char *env;
- char sbuf[STRLEN], stmp[STRLEN];
gmx_hw_info_t *hw;
- gmx_gpu_info_t gpuinfo_auto, gpuinfo_user;
- gmx_bool bGPUBin;
int ret;
/* make sure no one else is doing the same thing */
if (n_hwinfo == 0)
{
snew(hwinfo_g, 1);
- hwinfo_g->bConsistencyChecked = FALSE;
/* detect CPUID info; no fuss, we don't detect system-wide
* -- sloppy, but that's it for now */
hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
/* detect GPUs */
- hwinfo_g->gpu_info.ncuda_dev_use = 0;
- hwinfo_g->gpu_info.cuda_dev_use = NULL;
- hwinfo_g->gpu_info.ncuda_dev = 0;
- hwinfo_g->gpu_info.cuda_dev = NULL;
-
-#ifdef GMX_GPU
- bGPUBin = TRUE;
-#else
- bGPUBin = FALSE;
-#endif
-
- /* Bail if binary is not compiled with GPU acceleration, but this is either
- * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
- if (bForceUseGPU && !bGPUBin)
+ hwinfo_g->gpu_info.ncuda_dev = 0;
+ hwinfo_g->gpu_info.cuda_dev = NULL;
+ hwinfo_g->gpu_info.ncuda_dev_compatible = 0;
+
+ /* Run the detection if the binary was compiled with GPU support
+ * and we requested detection.
+ */
+ hwinfo_g->gpu_info.bDetectGPUs =
+ (bGPUBinary && bDetectGPUs &&
+ getenv("GMX_DISABLE_GPU_DETECTION") == NULL);
+ if (hwinfo_g->gpu_info.bDetectGPUs)
{
- gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
- }
- if (gpu_id != NULL && !bGPUBin)
- {
- gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
+ gmx_detect_gpus(fplog, cr, &hwinfo_g->gpu_info);
}
+ }
+ /* increase the reference counter */
+ n_hwinfo++;
- /* run the detection if the binary was compiled with GPU support */
- if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
- {
- char detection_error[STRLEN];
+ ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+ if (ret != 0)
+ {
+ gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+ }
- if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
- {
- if (detection_error != NULL && detection_error[0] != '\0')
- {
- sprintf(sbuf, ":\n %s\n", detection_error);
- }
- else
- {
- sprintf(sbuf, ".");
- }
- md_print_warn(cr, fplog,
- "NOTE: Error occurred during GPU detection%s"
- " Can not use GPU acceleration, will fall back to CPU kernels.\n",
- sbuf);
- }
- }
+ return hwinfo_g;
+}
- if (bForceUseGPU || bTryUseGPU)
+void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
+{
+ char *env;
+
+ if (gpu_opt->gpu_id != NULL && !bGPUBinary)
+ {
+ gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
+ }
+
+ env = getenv("GMX_GPU_ID");
+ if (env != NULL && gpu_opt->gpu_id != NULL)
+ {
+ gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+ }
+ if (env == NULL)
+ {
+ env = gpu_opt->gpu_id;
+ }
+
+ /* parse GPU IDs if the user passed any */
+ if (env != NULL)
+ {
+ parse_gpu_id_plain_string(env,
+ &gpu_opt->ncuda_dev_use,
+ &gpu_opt->cuda_dev_use);
+
+ if (gpu_opt->ncuda_dev_use == 0)
{
- env = getenv("GMX_GPU_ID");
- if (env != NULL && gpu_id != NULL)
- {
- gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
- }
- if (env == NULL)
- {
- env = gpu_id;
- }
+ gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
+ invalid_gpuid_hint);
+ }
- /* parse GPU IDs if the user passed any */
- if (env != NULL)
- {
- int *gpuid, *checkres;
- int nid, res;
+ gpu_opt->bUserSet = TRUE;
+ }
+}
- snew(gpuid, max_gpu_ids_user);
- snew(checkres, max_gpu_ids_user);
+void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
+ const gmx_gpu_info_t *gpu_info,
+ gmx_bool bForceUseGPU,
+ gmx_gpu_opt_t *gpu_opt)
+{
+ int i;
+ const char *env;
+ char sbuf[STRLEN], stmp[STRLEN];
- parse_gpu_id_plain_string(env, &nid, gpuid);
+ /* Bail if binary is not compiled with GPU acceleration, but this is either
+ * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
+ if (bForceUseGPU && !bGPUBinary)
+ {
+ gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ }
- if (nid == 0)
- {
- gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
- invalid_gpuid_hint);
- }
+ if (gpu_opt->bUserSet)
+ {
+ /* Check the GPU IDs passed by the user.
+ * (GPU IDs have been parsed by gmx_parse_gpu_ids before)
+ */
+ int *checkres;
+ int res;
- res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
- gpuid, nid);
+ snew(checkres, gpu_opt->ncuda_dev_use);
- if (!res)
- {
- print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
-
- sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
- for (i = 0; i < nid; i++)
- {
- if (checkres[i] != egpuCompatible)
- {
- sprintf(stmp, " GPU #%d: %s\n",
- gpuid[i], gpu_detect_res_str[checkres[i]]);
- strcat(sbuf, stmp);
- }
- }
- gmx_fatal(FARGS, "%s", sbuf);
- }
+ res = check_selected_cuda_gpus(checkres, gpu_info, gpu_opt);
- hwinfo_g->gpu_info.bUserSet = TRUE;
+ if (!res)
+ {
+ print_gpu_detection_stats(fplog, gpu_info, cr);
- sfree(gpuid);
- sfree(checkres);
- }
- else
+ sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
+ for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
{
- pick_compatible_gpus(&hwinfo_g->gpu_info);
- hwinfo_g->gpu_info.bUserSet = FALSE;
+ if (checkres[i] != egpuCompatible)
+ {
+ sprintf(stmp, " GPU #%d: %s\n",
+ gpu_opt->cuda_dev_use[i],
+ gpu_detect_res_str[checkres[i]]);
+ strcat(sbuf, stmp);
+ }
}
+ gmx_fatal(FARGS, "%s", sbuf);
+ }
- /* decide whether we can use GPU */
- hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
- if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
- {
- gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
- }
+ sfree(checkres);
+ }
+ else
+ {
+ pick_compatible_gpus(&hwinfo_g->gpu_info, gpu_opt);
+
+ if (gpu_opt->ncuda_dev_use > cr->nrank_pp_intranode)
+ {
+ /* We picked more GPUs than we can use: limit the number.
+ * We print detailed messages about this later in
+ * gmx_check_hw_runconf_consistency.
+ */
+ limit_num_gpus_used(gpu_opt, cr->nrank_pp_intranode);
}
+
+ gpu_opt->bUserSet = FALSE;
}
- /* increase the reference counter */
- n_hwinfo++;
- ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
- if (ret != 0)
+ /* If the user asked for a GPU, check whether we have a GPU */
+ if (bForceUseGPU && gpu_info->ncuda_dev_compatible == 0)
{
- gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+ gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
}
-
- return hwinfo_g;
}
-static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int count)
{
int ndev_use;
- assert(hwinfo);
+ assert(gpu_opt);
- ndev_use = hwinfo->gpu_info.ncuda_dev_use;
+ ndev_use = gpu_opt->ncuda_dev_use;
if (count > ndev_use)
{
}
/* TODO: improve this implementation: either sort GPUs or remove the weakest here */
- hwinfo->gpu_info.ncuda_dev_use = count;
+ gpu_opt->ncuda_dev_use = count;
}
void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
#include "gmx_cpuid.h"
#include "gmx_omp.h"
#include "gmx_omp_nthreads.h"
-#include "mdrun.h"
#include "md_logging.h"
#include "statutil.h"
#include "gmx_thread_affinity.h"
* \param[out] result_str the message related to the error that occurred
* during the initialization (if there was any).
* \param[in] gpu_info GPU info of all detected devices in the system.
+ * \param[in] gpu_opt options for using the GPUs in gpu_info
* \returns true if no error occurs during initialization.
*/
-gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
+gmx_bool init_gpu(int mygpu, char *result_str,
+ const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt)
{
cudaError_t stat;
char sbuf[STRLEN];
assert(gpu_info);
assert(result_str);
- if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
+ if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
{
sprintf(sbuf, "Trying to initialize an inexistent GPU: "
"there are %d %s-selected GPU(s), but #%d was requested.",
- gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
+ gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
gmx_incons(sbuf);
}
- gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
+ gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
stat = cudaSetDevice(gpuid);
strncpy(result_str, cudaGetErrorString(stat), STRLEN);
assert(gpu_info);
assert(err_str);
+ gpu_info->ncuda_dev_compatible = 0;
+
ndev = 0;
devs = NULL;
devs[i].id = i;
devs[i].prop = prop;
devs[i].stat = checkres;
+
+ if (checkres == egpuCompatible)
+ {
+ gpu_info->ncuda_dev_compatible++;
+ }
}
retval = 0;
}
* This function selects the compatible gpus and initializes
* gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
*
- * Given the list of GPUs available in the system the it checks each gpu in
- * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
- * the compatible ones into cuda_dev_use with this marking the respective
- * GPUs as "available for use."
+ * Given the list of GPUs available in the system, check each device in
+ * gpu_info->cuda_dev and place the indices of the compatible GPUs into
+ * cuda_dev_use with this marking the respective GPUs as "available for use."
* Note that \detect_cuda_gpus must have been called before.
*
- * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in,out] gpu_opt pointer to structure holding GPU options
*/
-void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+ gmx_gpu_opt_t *gpu_opt)
{
int i, ncompat;
int *compat;
}
}
- gpu_info->ncuda_dev_use = ncompat;
- snew(gpu_info->cuda_dev_use, ncompat);
- memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
+ gpu_opt->ncuda_dev_use = ncompat;
+ snew(gpu_opt->cuda_dev_use, ncompat);
+ memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
sfree(compat);
}
/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
*
- * Given the a list of GPU devide IDs in \requested_devs, check for the
- * existence and compatibility of the respective GPUs and fill in \gpu_info
- * with the collected information. Also provide the caller with an array with
+ * Given a list of gpu->ncuda_dev_use GPU device IDs stored in
+ * gpu_opt->cuda_dev_use check the existence and compatibility
+ * of the respective GPUs. Also provide the caller with an array containing
* the result of checks in \checkres.
*
* \param[out] checkres check result for each ID passed in \requested_devs
* \param[in] gpu_info pointer to structure holding GPU information
- * \param[in] requested_devs array of requested device IDs
- * \param[in] count number of IDs in \requested_devs
- * \returns TRUE if every requested GPU is compatible
+ * \param[out] gpu_opt pointer to structure holding GPU options
+ * \returns TRUE if all of the requested GPUs are compatible
*/
-gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
- const int *requested_devs, int count)
+gmx_bool check_selected_cuda_gpus(int *checkres,
+ const gmx_gpu_info_t *gpu_info,
+ gmx_gpu_opt_t *gpu_opt)
{
int i, id;
bool bAllOk;
assert(checkres);
assert(gpu_info);
- assert(requested_devs);
- assert(count >= 0);
+ assert(gpu_opt->ncuda_dev_use >= 0);
- if (count == 0)
+ if (gpu_opt->ncuda_dev_use == 0)
{
return TRUE;
}
+ assert(gpu_opt->cuda_dev_use);
+
/* we will assume that all GPUs requested are valid IDs,
otherwise we'll bail anyways */
- gpu_info->ncuda_dev_use = count;
- snew(gpu_info->cuda_dev_use, count);
bAllOk = true;
- for (i = 0; i < count; i++)
+ for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
{
- id = requested_devs[i];
+ id = gpu_opt->cuda_dev_use[i];
/* devices are stored in increasing order of IDs in cuda_dev */
- gpu_info->cuda_dev_use[i] = id;
+ gpu_opt->cuda_dev_use[i] = id;
checkres[i] = (id >= gpu_info->ncuda_dev) ?
egpuNonexistent : gpu_info->cuda_dev[id].stat;
return;
}
- sfree(gpu_info->cuda_dev_use);
sfree(gpu_info->cuda_dev);
}
* respective CUDA GPU.
*
* \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in] gpu_opt pointer to structure holding GPU options
* \param[in] idx index into the array of used GPUs
* \returns device ID of the requested GPU
*/
-int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
{
assert(gpu_info);
- if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
+ assert(gpu_opt);
+ /* Range check: use "||" so both negative and too-large indices are
+ * rejected; with "&&" the condition is always false and an invalid
+ * idx would read cuda_dev_use out of bounds. */
+ if (idx < 0 || idx >= gpu_opt->ncuda_dev_use)
{
return -1;
}
- return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
+ return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
}
/*! \brief Returns the device ID of the GPU currently in use.
return gpuid;
}
+
+/*! \brief Returns the size of the cuda_dev_info struct.
+ *
+ * The size of cuda_dev_info can be used for allocation and communication.
+ *
+ * \returns size in bytes of cuda_dev_info
+ */
+size_t sizeof_cuda_dev_info(void)
+{
+ return sizeof(cuda_dev_info);
+}
#include "statutil.h"
#include <ctype.h>
#include "macros.h"
+#include "string2.h"
#include "gromacs/utility/gmxmpi.h"
#endif
}
-#if defined GMX_LIB_MPI && defined GMX_IS_BGQ
+#if defined GMX_LIB_MPI && defined GMX_TARGET_BGQ
#include <spi/include/kernel/location.h>
#endif
+int gmx_physicalnode_id_hash(void)
+{
+ int hash_int;
+
+#ifndef GMX_LIB_MPI
+ /* We have a single physical node */
+ hash_int = 0;
+#else
+ int resultlen;
+ char mpi_hostname[MPI_MAX_PROCESSOR_NAME];
+
+ /* This procedure can only differentiate nodes with different names.
+ * Architectures where different physical nodes have identical names,
+ * such as IBM Blue Gene, should use an architecture specific solution.
+ */
+ MPI_Get_processor_name(mpi_hostname, &resultlen);
+
+ /* The string hash function returns an unsigned int. We cast to an int.
+ * Negative numbers are converted to positive by setting the sign bit to 0.
+ * This makes the hash one bit smaller.
+ * A 63-bit hash (with 64-bit int) should be enough for unique node hashes,
+ * even on a million node machine. 31 bits might not be enough though!
+ */
+ hash_int =
+ (int)gmx_string_fullhash_func(mpi_hostname, gmx_string_hash_init);
+ if (hash_int < 0)
+ {
+ hash_int -= INT_MIN;
+ }
+#endif
+
+ return hash_int;
+}
+
+/* TODO: this function should be fully replaced by gmx_physicalnode_id_hash */
int gmx_hostname_num()
{
#ifndef GMX_MPI
char mpi_hostname[MPI_MAX_PROCESSOR_NAME], hostnum_str[MPI_MAX_PROCESSOR_NAME];
MPI_Get_processor_name(mpi_hostname, &resultlen);
-#ifdef GMX_IS_BGQ
+#ifdef GMX_TARGET_BGQ
Personality_t personality;
Kernel_GetPersonality(&personality, sizeof(personality));
/* Each MPI rank has a unique coordinate in a 6-dimensional space
{
fprintf(debug, "In gmx_hostname_num: hostname '%s', hostnum %d\n",
mpi_hostname, hostnum);
-#ifdef GMX_IS_BGQ
+#ifdef GMX_TARGET_BGQ
fprintf(debug,
"Torus ID A: %d / %d B: %d / %d C: %d / %d D: %d / %d E: %d / %d\nNode ID T: %d / %d core: %d / %d hardware thread: %d / %d\n",
personality.Network_Config.Acoord,
gmx_string_hash_init = 5381;
+unsigned int
+gmx_string_fullhash_func(const char *s, unsigned int hash_init)
+{
+ int c;
+
+ while ((c = (*s++)) != '\0')
+ {
+ hash_init = ((hash_init << 5) + hash_init) ^ c; /* (hash * 33) xor c */
+ }
+ return hash_init;
+}
+
unsigned int
gmx_string_hash_func(const char *s, unsigned int hash_init)
{
* possible after subsequently setting a shorter cut-off with change_dd_cutoff.
*/
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+ const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt);
+/* When domains (PP MPI ranks) share a GPU, the individual GPU wait times
+ * are meaningless, as they depend on the order in which tasks on the same
+ * GPU finish. Therefore these wait times need to be averaged over the ranks
+ * sharing the same GPU. This function sets up the communication for that.
+ */
+
void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd);
void dd_collect_vec(gmx_domdec_t *dd,
t_state *state_local, t_state *state);
enum {
- ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr
+ ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclWaitGPU, ddCyclPME, ddCyclNr
};
void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl);
* use with group kernels.
*/
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab);
-/* Initializes the interaction constant data structure. Currently it
- * uses forcerec as input.
- */
-
void init_forcerec(FILE *fplog,
const output_env_t oenv,
t_forcerec *fr,
/* Check if this binary was compiled with the same acceleration as we
* would suggest for the current hardware. Always print stats to the log file
- * if it is non-NULL, and print a warning in stdout if we don't have a match.
+ * if it is non-NULL, and if we don't have a match, print a warning in log
+ * (if non-NULL) and if print_to_stderr!=0 also to stderr.
*/
int
gmx_cpuid_acceleration_check (gmx_cpuid_t cpuid,
- FILE * log);
+ FILE * log,
+ int print_to_stderr);
/* Release resources used by data structure. Note that the pointer to the
} /* fixes auto-indentation problems */
#endif
-/* the init and consistency functions depend on commrec that may not be
+/* the init and consistency functions depend on commrec that may not be
consistent in cuda because MPI types don't exist there. */
#ifndef __CUDACC__
#include "types/commrec.h"
/* return a pointer to a global hwinfo structure. */
gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id);
+ gmx_bool bDetectGPUs);
void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
-/* Check the thread count + GPU assignment. This function must
- either be run by all threads that persist (i.e. all tmpi threads),
- or be run before they are created. */
-void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr, int ntmpi_requsted,
- gmx_bool bUseGPU);
+void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt);
+
+void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
+ const gmx_gpu_info_t *gpu_info,
+ gmx_bool bForceUseGPU,
+ gmx_gpu_opt_t *gpu_opt);
+
+/* Check the consistency of hw_opt with hwinfo.
+ This function should be called once on each MPI rank. */
+void gmx_check_hw_runconf_consistency(FILE *fplog,
+ const gmx_hw_info_t *hwinfo,
+ const t_commrec *cr,
+ const gmx_hw_opt_t *hw_opt,
+ gmx_bool bUseGPU);
#endif
/* Check whether a GPU is shared among ranks, and return the number of shared
gpus
- hwinfo = the hwinfo struct
+ gpu_opt = the gpu options struct
returns: The number of GPUs shared among ranks, or 0 */
-int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info);
+int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt);
#ifdef __cplusplus
#endif
#endif
-#ifdef GMX_IS_X86
+#ifdef GMX_TARGET_X86
#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */
#endif /* GMX_X86_SSE2 */
-#endif /* GMX_IS_X86 */
+#endif /* GMX_TARGET_X86 */
#ifdef GMX_CPU_ACCELERATION_IBM_QPX
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2010, The GROMACS development team,
* check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
#ifndef _GPU_UTILS_H_
int detect_cuda_gpus(gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) FUNC_TERM_INT
FUNC_QUALIFIER
-void pick_compatible_gpus(gmx_gpu_info_t gmx_unused *gpu_info) FUNC_TERM_VOID
+void pick_compatible_gpus(const gmx_gpu_info_t gmx_unused *gpu_info,
+ gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_VOID
FUNC_QUALIFIER
-gmx_bool check_select_cuda_gpus(int gmx_unused *checkres, gmx_gpu_info_t gmx_unused *gpu_info,
- const int gmx_unused *requested_devs, int gmx_unused count) FUNC_TERM_INT
+gmx_bool check_selected_cuda_gpus(int gmx_unused *checkres,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
FUNC_QUALIFIER
void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info) FUNC_TERM_VOID
FUNC_QUALIFIER
-gmx_bool init_gpu(int gmx_unused mygpu, char gmx_unused *result_str, const gmx_gpu_info_t gmx_unused *gpu_info) FUNC_TERM_INT
+gmx_bool init_gpu(int gmx_unused mygpu, char gmx_unused *result_str,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
FUNC_QUALIFIER
gmx_bool free_gpu(char gmx_unused *result_str) FUNC_TERM_INT
int get_current_gpu_device_id(void) FUNC_TERM_INT
FUNC_QUALIFIER
-int get_gpu_device_id(const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index) FUNC_TERM_INT
+int get_gpu_device_id(const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ int index) FUNC_TERM_INT
FUNC_QUALIFIER
void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index) FUNC_TERM_VOID
+FUNC_QUALIFIER
+size_t sizeof_cuda_dev_info(void) FUNC_TERM_INT
+
#ifdef __cplusplus
}
#endif
ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
};
-/* The options for the thread affinity setting, default: auto */
-enum {
- threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
-};
-
-typedef struct {
- int nthreads_tot; /* Total number of threads requested (TMPI) */
- int nthreads_tmpi; /* Number of TMPI threads requested */
- int nthreads_omp; /* Number of OpenMP threads requested */
- int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */
- int thread_affinity; /* Thread affinity switch, see enum above */
- int core_pinning_stride; /* Logical core pinning stride */
- int core_pinning_offset; /* Logical core pinning offset */
- char *gpu_id; /* GPU id's to use, each specified as chars */
-} gmx_hw_opt_t;
-
/* Variables for temporary use with the deform option,
* used in runner.c and md.c.
* (These variables should be stored in the tpx file.)
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2012, The GROMACS development team,
* check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
#ifndef NBNXN_CUDA_DATA_MGMT_H
/*! Initializes the data structures related to CUDA nonbonded calculations. */
FUNC_QUALIFIER
-void nbnxn_cuda_init(FILE gmx_unused *fplog,
- nbnxn_cuda_ptr_t gmx_unused *p_cu_nb,
- const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused my_gpu_index,
+void nbnxn_cuda_init(FILE gmx_unused *fplog,
+ nbnxn_cuda_ptr_t gmx_unused *p_cu_nb,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ int gmx_unused my_gpu_index,
/* true of both local and non-local are don on GPU */
- gmx_bool gmx_unused bLocalAndNonlocal) FUNC_TERM
+ gmx_bool gmx_unused bLocalAndNonlocal) FUNC_TERM
/*! Initializes simulation constant data. */
FUNC_QUALIFIER
int gmx_node_rank(void);
/* return the rank of the node */
+int gmx_physicalnode_id_hash(void);
+/* Return a non-negative hash that is, hopefully, unique for each physical node.
+ * This hash is useful for determining hardware locality.
+ */
+
int gmx_hostname_num(void);
/* Ostensibly, returns a integer characteristic of and unique to each
physical node in the MPI system. If the first part of the MPI
/*
+ * This file is part of the GROMACS molecular simulation package.
*
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team,
* check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * Gromacs Runs On Most of All Computer Systems
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
/*! \file
* \brief Generic string handling functions.
* \param[in] hash_init Initial (or previous) hash value.
* \returns Updated hash value (hash_init combined with string hash).
*
- * This routine only uses characters for which isalnum(c) is true,
- * and all characters are converted to upper case.
* On the first invocation for a new string, use the constant
* gmx_string_hash_init for the second argument. If you want to create a hash
* corresponding to several concatenated strings, provide the returned hash
* value as hash_init for the second string, etc.
*/
unsigned int
+gmx_string_fullhash_func(const char *s, unsigned int hash_init);
+
+/*! \brief
+ * Return a hash of the string according to Dan J. Bernstein's algorithm.
+ *
+ * \param[in] s String to calculate hash for.
+ * \param[in] hash_init Initial (or previous) hash value.
+ * \returns Updated hash value (hash_init combined with string hash).
+ *
+ * Identical to gmx_string_fullhash_func, except that
+ * this routine only uses characters for which isalnum(c) is true,
+ * and all characters are converted to upper case.
+ */
+unsigned int
gmx_string_hash_func(const char *s, unsigned int hash_init);
/** Return value for gmx_wcmatch() when there is no match. */
rvec posres_comB;
const gmx_hw_info_t *hwinfo;
+ const gmx_gpu_opt_t *gpu_opt;
gmx_bool use_cpu_acceleration;
/* Interaction for calculated in kernels. In many cases this is similar to
* The gmx_hardware_detect module initializes it. */
typedef struct
{
- gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
-
- int ncuda_dev_use; /* number of devices selected to be used */
- int *cuda_dev_use; /* index of the devices selected to be used */
- int ncuda_dev; /* total number of devices detected */
- cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */
+ gmx_bool bDetectGPUs; /* Did we try to detect GPUs? */
+ int ncuda_dev; /* total number of devices detected */
+ cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */
+ int ncuda_dev_compatible; /* number of compatible GPUs */
} gmx_gpu_info_t;
/* Hardware information structure with CPU and GPU information.
* (i.e. must be able to be shared among all threads) */
typedef struct
{
- gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
int nthreads_hw_avail; /* Number of hardware threads available; this number
is based on the number of CPUs reported as available
by the OS at the time of detection. */
- gmx_bool bConsistencyChecked; /* whether
- gmx_check_hw_runconf_consistency()
- has been run with this hw_info */
} gmx_hw_info_t;
+
+/* The options for the thread affinity setting, default: auto */
+enum {
+ threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
+};
+
+/* GPU device selection information -- for now with only CUDA devices */
+typedef struct
+{
+ char *gpu_id; /* GPU id's to use, each specified as chars */
+ gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
+
+ int ncuda_dev_use; /* number of device (IDs) selected to be used */
+ int *cuda_dev_use; /* device index list providing GPU to PP rank mapping, GPUs can be listed multiple times when ranks share them */
+} gmx_gpu_opt_t;
+
+/* Threading and GPU options, can be set automatically or by the user */
+typedef struct {
+ int nthreads_tot; /* Total number of threads requested (TMPI) */
+ int nthreads_tmpi; /* Number of TMPI threads requested */
+ int nthreads_omp; /* Number of OpenMP threads requested */
+ int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */
+ int thread_affinity; /* Thread affinity switch, see enum above */
+ int core_pinning_stride; /* Logical core pinning stride */
+ int core_pinning_offset; /* Logical core pinning offset */
+
+ gmx_gpu_opt_t gpu_opt; /* The GPU options */
+} gmx_hw_opt_t;
+
#ifdef __cplusplus
}
#endif
#include "nbnxn_search.h"
#include "bondf.h"
#include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
#include "gromacs/fileio/futil.h"
#include "gromacs/fileio/gmxfio.h"
/* Stuff for load communication */
gmx_bool bRecordLoad;
gmx_domdec_load_t *load;
+ int nrank_gpu_shared;
#ifdef GMX_MPI
MPI_Comm *mpi_comm_load;
+ MPI_Comm mpi_comm_gpu_shared;
#endif
/* Maximum DLB scaling per load balancing step in percent */
if (comm->cycl_n[ddCyclF] > 1)
{
/* Subtract the maximum of the last n cycle counts
- * to get rid of possible high counts due to other soures,
+ * to get rid of possible high counts due to other sources,
* for instance system activity, that would otherwise
* affect the dynamic load balancing.
*/
load -= comm->cycl_max[ddCyclF];
}
+
+#ifdef GMX_MPI
+ if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+ {
+ float gpu_wait, gpu_wait_sum;
+
+ gpu_wait = comm->cycl[ddCyclWaitGPU];
+ if (comm->cycl_n[ddCyclF] > 1)
+ {
+ /* We should remove the WaitGPU time of the same MD step
+ * as the one with the maximum F time, since the F time
+ * and the wait time are not independent.
+ * Furthermore, the step for the max F time should be chosen
+ * the same on all ranks that share the same GPU.
+ * But to keep the code simple, we remove the average instead.
+ * The main reason for artificially long times at some steps
+ * is spurious CPU activity or MPI time, so we don't expect
+ * that changes in the GPU wait time matter a lot here.
+ */
+ gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+ }
+ /* Sum the wait times over the ranks that share the same GPU */
+ MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+ comm->mpi_comm_gpu_shared);
+ /* Replace the wait time by the average over the ranks */
+ load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+ }
+#endif
}
return load;
}
#endif
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+ const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt)
+{
+#ifdef GMX_MPI
+ int physicalnode_id_hash;
+ int gpu_id;
+ gmx_domdec_t *dd;
+ MPI_Comm mpi_comm_pp_physicalnode;
+
+ if (!(cr->duty & DUTY_PP) ||
+ hw_opt->gpu_opt.ncuda_dev_use == 0)
+ {
+ /* Only PP nodes (currently) use GPUs.
+ * If we don't have GPUs, there are no resources to share.
+ */
+ return;
+ }
+
+ physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+ gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+ dd = cr->dd;
+
+ if (debug)
+ {
+ fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+ fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+ dd->rank, physicalnode_id_hash, gpu_id);
+ }
+ /* Split the PP communicator over the physical nodes */
+ /* TODO: See if we should store this (before), as it's also used for
+ * the nodecomm summation.
+ */
+ MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+ &mpi_comm_pp_physicalnode);
+ MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+ &dd->comm->mpi_comm_gpu_shared);
+ MPI_Comm_free(&mpi_comm_pp_physicalnode);
+ MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+ if (debug)
+ {
+ fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+ }
+
+ /* Note that some ranks could share a GPU, while others don't */
+
+ if (dd->comm->nrank_gpu_shared == 1)
+ {
+ MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+ }
+#endif
+}
+
static void make_load_communicators(gmx_domdec_t *dd)
{
#ifdef GMX_MPI
}
+ /* Initialize the GPU share count to 0, might change later */
+ comm->nrank_gpu_shared = 0;
+
comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
comm->bDynLoadBal = (comm->eDLB == edlbYES);
const gmx_hw_info_t *hwinfo,
gmx_bool bDoNonbonded,
gmx_bool *bUseGPU,
- gmx_bool *bEmulateGPU)
+ gmx_bool *bEmulateGPU,
+ const gmx_gpu_opt_t *gpu_opt)
{
gmx_bool bEmulateGPUEnvVarSet;
char gpu_err_str[STRLEN];
* Note that you should freezing the system as otherwise it will explode.
*/
*bEmulateGPU = (bEmulateGPUEnvVarSet ||
- (!bDoNonbonded && hwinfo->bCanUseGPU));
+ (!bDoNonbonded &&
+ gpu_opt->ncuda_dev_use > 0));
/* Enable GPU mode when GPUs are available or no GPU emulation is requested.
*/
- if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
- if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
+ &hwinfo->gpu_info, gpu_opt))
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+ cr->rank_pp_intranode),
gpu_err_str);
}
}
}
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+static void init_interaction_const(FILE *fp,
+ const t_commrec *cr,
+ interaction_const_t **interaction_const,
+ const t_forcerec *fr,
+ real rtab)
{
interaction_const_t *ic;
gmx_bool bUsesSimpleTables = TRUE;
if (fr->nbv != NULL && fr->nbv->bUseGPU)
{
nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
pick_nbnxn_resources(cr, fr->hwinfo,
fr->bNonbonded,
&nbv->bUseGPU,
- &bEmulateGPU);
+ &bEmulateGPU,
+ fr->gpu_opt);
nbv->nbs = NULL;
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ &fr->hwinfo->gpu_info, fr->gpu_opt,
+ cr->rank_pp_intranode,
(nbv->ngrp > 1) && !bHybridGPURun);
if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
* In mdrun, hwinfo has already been set before calling init_forcerec.
* Here we ignore GPUs, as tools will not use them anyhow.
*/
- fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
}
/* By default we turn acceleration on, but it might be turned off further down... */
}
/* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, &fr->ic, fr, rtab);
+ init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);
void nbnxn_cuda_init(FILE *fplog,
nbnxn_cuda_ptr_t *p_cu_nb,
- const gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int my_gpu_index,
gmx_bool bLocalAndNonlocal)
{
cudaError_t stat;
init_plist(nb->plist[eintLocal]);
/* set device info, just point it to the right GPU among the detected ones */
- nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)];
+ nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
/* local/non-local GPU streams */
stat = cudaStreamCreate(&nb->stream[eintLocal]);
bTMPIAtomics = false;
#endif
-#ifdef GMX_IS_X86
+#ifdef GMX_TARGET_X86
bX86 = true;
#else
bX86 = false;
* - GPUs are not being shared.
*/
bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
- (gmx_count_gpu_dev_shared(gpu_info) < 1));
+ (gmx_count_gpu_dev_shared(gpu_opt) < 1));
if (bStreamSync)
{
}}
}}
- /* With Ewald type electrostatics we the forces for excluded atom pairs
- * should not contribute to the virial sum. The exclusion forces
- * are not calculate in the energy kernels, but are in _noener.
- */
- if (!((force_flags & GMX_FORCE_ENERGY) ||
- (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ if (!(force_flags & GMX_FORCE_ENERGY))
{{
/* Don't calculate energies */
p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
out->f,
fshift_p);
}}
- else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ else if (out->nV == 1)
{{
/* No energy groups */
out->Vvdw[0] = 0;
}
}
- /* With Ewald type electrostatics we the forces for excluded atom pairs
- * should not contribute to the virial sum. The exclusion forces
- * are not calculate in the energy kernels, but are in _noener.
- */
- if (!((force_flags & GMX_FORCE_ENERGY) ||
- (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ if (!(force_flags & GMX_FORCE_ENERGY))
{
/* Don't calculate energies */
p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
out->f,
fshift_p);
}
- else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ else if (out->nV == 1)
{
/* No energy groups */
out->Vvdw[0] = 0;
}
}
- /* With Ewald type electrostatics we the forces for excluded atom pairs
- * should not contribute to the virial sum. The exclusion forces
- * are not calculate in the energy kernels, but are in _noener.
- */
- if (!((force_flags & GMX_FORCE_ENERGY) ||
- (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+ if (!(force_flags & GMX_FORCE_ENERGY))
{
/* Don't calculate energies */
p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
out->f,
fshift_p);
}
- else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+ else if (out->nV == 1)
{
/* No energy groups */
out->Vvdw[0] = 0;
* or easier, allocate at least n*SGSF elements.
*/
static void sort_atoms(int dim, gmx_bool Backwards,
+ int dd_zone,
int *a, int n, rvec *x,
real h0, real invh, int n_per_h,
int *sort)
#ifndef NDEBUG
/* As we can have rounding effect, we use > iso >= here */
- if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
+ if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
{
gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
}
#endif
+ /* In a non-local domain, particles communicated for bonded interactions
+ * can be far beyond the grid size, which is set by the non-bonded
+ * cut-off distance. We sort such particles into the last cell.
+ */
+ if (zi > n_per_h*SORT_GRID_OVERSIZE)
+ {
+ zi = n_per_h*SORT_GRID_OVERSIZE;
+ }
+
/* Ideally this particle should go in sort cell zi,
* but that might already be in use,
* in that case find the first empty cell higher up
ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
/* Sort the atoms within each x,y column on z coordinate */
- sort_atoms(ZZ, FALSE,
+ sort_atoms(ZZ, FALSE, dd_zone,
nbs->a+ash, na, x,
grid->c0[ZZ],
1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
/* Sort the atoms within each x,y column on z coordinate */
- sort_atoms(ZZ, FALSE,
+ sort_atoms(ZZ, FALSE, dd_zone,
nbs->a+ash, na, x,
grid->c0[ZZ],
1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
#if GPU_NSUBCELL_Y > 1
/* Sort the atoms along y */
- sort_atoms(YY, (sub_z & 1),
+ sort_atoms(YY, (sub_z & 1), dd_zone,
nbs->a+ash_z, na_z, x,
grid->c0[YY]+cy*grid->sy,
grid->inv_sy, subdiv_z,
#if GPU_NSUBCELL_X > 1
/* Sort the atoms along x */
- sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
+ sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
nbs->a+ash_y, na_y, x,
grid->c0[XX]+cx*grid->sx,
grid->inv_sx, subdiv_y,
matrix boxs;
rvec vzero, box_diag;
real e, v, dvdl;
- float cycles_pme, cycles_force;
+ float cycles_pme, cycles_force, cycles_wait_gpu;
nonbonded_verlet_t *nbv;
- cycles_force = 0;
- nbv = fr->nbv;
- nb_kernel_type = fr->nbv->grp[0].kernel_type;
+ cycles_force = 0;
+ cycles_wait_gpu = 0;
+ nbv = fr->nbv;
+ nb_kernel_type = fr->nbv->grp[0].kernel_type;
start = mdatoms->start;
homenr = mdatoms->homenr;
{
if (bUseGPU)
{
+ float cycles_tmp;
+
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
nbnxn_cuda_wait_gpu(nbv->cu_nbv,
nbv->grp[eintNonlocal].nbat,
flags, eatNonlocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ cycles_wait_gpu += cycles_tmp;
+ cycles_force += cycles_tmp;
}
else
{
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
/* now clear the GPU outputs while we finish the step on the CPU */
if (wcycle)
{
dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ if (bUseGPU)
+ {
+ dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+ }
}
}
"[PAR]",
"With GPUs (only supported with the Verlet cut-off scheme), the number",
"of GPUs should match the number of MPI processes or MPI threads,",
- "excluding PME-only processes/threads. With thread-MPI the number",
+ "excluding PME-only processes/threads. With thread-MPI, unless set on the command line, the number",
"of MPI threads will automatically be set to the number of GPUs detected.",
- "When you want to use a subset of the available GPUs, you can use",
- "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
- "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
- "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
- "variable instead. The format for GMX_GPU_ID is identical to ",
- "[TT]-gpu_id[tt], but an environment variable can have different values",
- "on different nodes of a cluster.",
+ "To use a subset of the available GPUs, or to manually provide a mapping of",
+ "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is",
+ "a string of digits (without delimiter) representing device id-s of the GPUs to be used.",
+ "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node",
+ "respectively. To select different sets of GPU-s",
+ "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment",
+ "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ",
+ "[TT]-gpu_id[tt], with the difference that an environment variable can have",
+ "different values on different compute nodes. Multiple MPI ranks on each node",
+ "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)",
+ "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.",
+ "This works within a single simulation, or a multi-simulation, with any form of MPI.",
"[PAR]",
"When using PME with separate PME nodes or with a GPU, the two major",
"compute tasks, the non-bonded force calculation and the PME calculation",
output_env_t oenv = NULL;
const char *deviceOptions = "";
- gmx_hw_opt_t hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+ /* Non transparent initialization of a complex gmx_hw_opt_t struct.
+ * But unfortunately we are not allowed to call a function here,
+ * since declarations follow below.
+ */
+ gmx_hw_opt_t hw_opt = { 0, 0, 0, 0, threadaffSEL, 0, 0,
+ { NULL, FALSE, 0, NULL } };
t_pargs pa[] = {
"The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
{ "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
"Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
- { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_id},
- "List of GPU id's to use" },
+ { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id},
+ "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" },
{ "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
"Check for all bonded interactions with DD" },
{ "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab texture ref will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
else
{
#ifdef GMX_THREAD_MPI
struct mdrunner_arglist
{
- gmx_hw_opt_t *hw_opt;
+ gmx_hw_opt_t hw_opt;
FILE *fplog;
t_commrec *cr;
int nfile;
fplog = mc.fplog;
}
- mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+ mda->ret = mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
mc.ddxyz, mc.dd_node_order, mc.rdd,
mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
fnmn = dup_tfn(nfile, fnm);
/* fill the data structure to pass as void pointer to thread start fn */
- mda->hw_opt = hw_opt;
+ /* hw_opt contains pointers, which should all be NULL at this stage */
+ mda->hw_opt = *hw_opt;
mda->fplog = fplog;
mda->cr = cr;
mda->nfile = nfile;
nthreads_tot_max = nthreads_hw;
}
- bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
+ hwinfo->gpu_info.ncuda_dev_compatible > 0);
if (bCanUseGPU)
{
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
}
else
{
ngpu = 0;
}
+ if (inputrec->cutoff_scheme == ecutsGROUP)
+ {
+ /* We checked this before, but it doesn't hurt to do it once more */
+ assert(hw_opt->nthreads_omp == 1);
+ }
+
nthreads_tmpi =
get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
t_inputrec *ir,
const gmx_mtop_t *mtop,
matrix box,
- gmx_bool *bUseGPU)
+ gmx_bool bUseGPU)
{
- /* Here we only check for GPU usage on the MPI master process,
- * as here we don't know how many GPUs we will use yet.
- * We check for a GPU on all processes later.
- */
- *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
-
if (ir->verletbuf_drift > 0)
{
/* Update the Verlet buffer size for the current run setup */
* calc_verlet_buffer_size gives the same results for 4x8 and 4x4
* and 4x2 gives a larger buffer than 4x4, this is ok.
*/
- verletbuf_get_list_setup(*bUseGPU, &ls);
+ verletbuf_get_list_setup(bUseGPU, &ls);
calc_verlet_buffer_size(mtop, det(box), ir,
ir->verletbuf_drift, &ls,
/* With GPU or emulation we should check nstlist for performance */
if ((EI_DYNAMICS(ir->eI) &&
- *bUseGPU &&
+ bUseGPU &&
ir->nstlist < NSTLIST_GPU_ENOUGH) ||
getenv(NSTLIST_ENVVAR) != NULL)
{
gmx_mtop_remove_chargegroups(mtop);
}
-static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
- int cutoff_scheme,
- gmx_bool bIsSimMaster)
+static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
+{
+ fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ hw_opt->nthreads_tot,
+ hw_opt->nthreads_tmpi,
+ hw_opt->nthreads_omp,
+ hw_opt->nthreads_omp_pme,
+ hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+}
+
+/* Checks we can do when we don't (yet) know the cut-off scheme */
+static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
+ gmx_bool bIsSimMaster)
{
gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
}
#endif
- if (cutoff_scheme == ecutsGROUP)
- {
- /* We only have OpenMP support for PME only nodes */
- if (hw_opt->nthreads_omp > 1)
- {
- gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
- ecutscheme_names[cutoff_scheme],
- ecutscheme_names[ecutsVERLET]);
- }
- hw_opt->nthreads_omp = 1;
- }
-
if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
{
gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
}
+ /* Parse GPU IDs, if provided.
+ * We check consistency with the tMPI thread count later.
+ */
+ gmx_parse_gpu_ids(&hw_opt->gpu_opt);
+
+#ifdef GMX_THREAD_MPI
+ if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+ {
+ /* Set the number of MPI threads equal to the number of GPUs */
+ hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
+
+ if (hw_opt->nthreads_tot > 0 &&
+ hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+ {
+ /* We have more GPUs than total threads requested.
+ * We choose to (later) generate a mismatch error,
+ * instead of launching more threads than requested.
+ */
+ hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+ }
+ }
+#endif
+
if (debug)
{
- fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
- hw_opt->nthreads_tot,
- hw_opt->nthreads_tmpi,
- hw_opt->nthreads_omp,
- hw_opt->nthreads_omp_pme,
- hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
+ print_hw_opt(debug, hw_opt);
+ }
+}
+/* Checks we can do when we know the cut-off scheme */
+static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
+ int cutoff_scheme)
+{
+ if (cutoff_scheme == ecutsGROUP)
+ {
+ /* We only have OpenMP support for PME only nodes */
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ ecutscheme_names[cutoff_scheme],
+ ecutscheme_names[ecutsVERLET]);
+ }
+ hw_opt->nthreads_omp = 1;
+ }
+
+ if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ {
+ hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ }
+
+ if (debug)
+ {
+ print_hw_opt(debug, hw_opt);
}
}
}
}
-/* Data structure set by SIMMASTER which needs to be passed to all nodes
- * before the other nodes have read the tpx file and called gmx_detect_hardware.
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
*/
-typedef struct {
- int cutoff_scheme; /* The cutoff scheme from inputrec_t */
- gmx_bool bUseGPU; /* Use GPU or GPU emulation */
-} master_inf_t;
+static void free_gpu_resources(FILE *fplog,
+ const t_forcerec *fr,
+ const t_commrec *cr)
+{
+ gmx_bool bIsPPrankUsingGPU;
+ char gpu_err_str[STRLEN];
+
+ bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+ if (bIsPPrankUsingGPU)
+ {
+ /* free nbnxn data in GPU memory */
+ nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the context in free_gpu() as some ranks may be sharing
+ * GPU and context.
+ * Note: as only PP ranks need to free GPU resources, it is safe to
+ * not call the barrier on PME ranks.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ /* uninitialize GPU (by destroying the context) */
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On node %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+}
int mdrunner(gmx_hw_opt_t *hw_opt,
FILE *fplog, t_commrec *cr, int nfile,
int nthreads_pp = 1;
gmx_membed_t membed = NULL;
gmx_hw_info_t *hwinfo = NULL;
- master_inf_t minf = {-1, FALSE};
+ /* The master rank decides early on bUseGPU and broadcasts this later */
+ gmx_bool bUseGPU = FALSE;
/* CAUTION: threads may be started later on in this function, so
cr doesn't reflect the final parallel state right now */
/* Detect hardware, gather information. This is an operation that is
* global for this process (MPI rank). */
- hwinfo = gmx_detect_hardware(fplog, cr,
- bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+ hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
snew(state, 1);
convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
}
-
- minf.cutoff_scheme = inputrec->cutoff_scheme;
- minf.bUseGPU = FALSE;
-
if (inputrec->cutoff_scheme == ecutsVERLET)
{
+ /* Here the master rank decides if all ranks will use GPUs */
+ bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
+ getenv("GMX_EMULATE_GPU") != NULL);
+
prepare_verlet_scheme(fplog, hwinfo, cr,
inputrec, mtop, state->box,
- &minf.bUseGPU);
+ bUseGPU);
}
- else if (hwinfo->bCanUseGPU)
+ else if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
{
md_print_warn(cr, fplog,
"NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
}
}
-#ifdef GMX_IS_BGQ
+#ifdef GMX_TARGET_BGQ
else
{
md_print_warn(cr, fplog,
}
#endif
}
-#ifndef GMX_THREAD_MPI
- if (PAR(cr))
- {
- gmx_bcast_sim(sizeof(minf), &minf, cr);
- }
-#endif
- if (minf.bUseGPU && cr->npmenodes == -1)
- {
- /* Don't automatically use PME-only nodes with GPUs */
- cr->npmenodes = 0;
- }
/* Check for externally set OpenMP affinity and turn off internal
* pinning if any is found. We need to do this check early to tell
*/
gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
-#ifdef GMX_THREAD_MPI
- /* With thread-MPI inputrec is only set here on the master thread */
+ /* Check and update the hardware options for internal consistency */
+ check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr));
+
if (SIMMASTER(cr))
-#endif
{
- check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
-
#ifdef GMX_THREAD_MPI
- /* Early check for externally set process affinity. Can't do over all
- * MPI processes because hwinfo is not available everywhere, but with
- * thread-MPI it's needed as pinning might get turned off which needs
- * to be known before starting thread-MPI. */
+ /* Early check for externally set process affinity.
+ * With thread-MPI this is needed as pinning might get turned off,
+ * which needs to be known before starting thread-MPI.
+ * With thread-MPI hw_opt is processed here on the master rank
+ * and passed to the other ranks later, so we only do this on master.
+ */
gmx_check_thread_affinity_set(fplog,
NULL,
hw_opt, hwinfo->nthreads_hw_avail, FALSE);
#ifdef GMX_THREAD_MPI
if (SIMMASTER(cr))
{
+ /* Since the master knows the cut-off scheme, update hw_opt for this.
+ * This is done later for normal MPI and also once more with tMPI
+ * for all tMPI ranks.
+ */
+ check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
/* NOW the threads will be started: */
hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
hw_opt,
fflush(stderr);
#endif
+ /* Check and update hw_opt for the cut-off scheme */
+ check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
+
gmx_omp_nthreads_init(fplog, cr,
hwinfo->nthreads_hw_avail,
hw_opt->nthreads_omp,
(cr->duty & DUTY_PP) == 0,
inputrec->cutoff_scheme == ecutsVERLET);
- /* check consistency and decide on the number of gpus to use. */
- gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
- minf.bUseGPU);
+ if (PAR(cr))
+ {
+ /* The master rank decided on the use of GPUs,
+ * broadcast this information to all ranks.
+ */
+ gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
+ }
+
+ if (bUseGPU)
+ {
+ if (cr->npmenodes == -1)
+ {
+ /* Don't automatically use PME-only nodes with GPUs */
+ cr->npmenodes = 0;
+ }
+
+ /* Select GPU id's to use */
+ gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
+ &hw_opt->gpu_opt);
+ }
+
+ /* check consistency of CPU acceleration and number of GPUs selected */
+ gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* When we share GPUs over ranks, we need to know this for the DLB */
+ dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+ }
/* getting number of PP/PME threads
PME: env variable should be read only on one node to make sure it is
}
/* Initiate forcerecord */
- fr = mk_forcerec();
- fr->hwinfo = hwinfo;
+ fr = mk_forcerec();
+ fr->hwinfo = hwinfo;
+ fr->gpu_opt = &hw_opt->gpu_opt;
init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
opt2fn("-table", nfile, fnm),
opt2fn("-tabletf", nfile, fnm),
opt2fn("-tablep", nfile, fnm),
opt2fn("-tableb", nfile, fnm),
nbpu_opt,
- FALSE, pforce);
+ FALSE,
+ pforce);
/* version for PCA_NOT_READ_NODE (see md.c) */
/*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
- if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
- {
- char gpu_err_str[STRLEN];
-
- /* free GPU memory and uninitialize GPU (by destroying the context) */
- nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
- if (!free_gpu(gpu_err_str))
- {
- gmx_warning("On node %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
- }
- }
+ /* Free GPU memory and context */
+ free_gpu_resources(fplog, fr, cr);
if (opt2bSet("-membed", nfile, fnm))
{
endif()
#currently not testing tools because they don't contain any useful tests
- foreach(subtest simple complex kernel freeenergy pdb2gmx)
+ foreach(subtest simple complex kernel freeenergy pdb2gmx rotation)
add_test(NAME regressiontests/${subtest}
#windows requires the command to be perl and not the script
COMMAND perl "${REGRESSIONTEST_PATH}/gmxtest.pl" ${subtest} ${ARGS})