# machine with no git.
#
# NOTE: when releasing the "-dev" suffix needs to be stripped off!
-set(PROJECT_VERSION "4.6.1-dev")
+set(PROJECT_VERSION "4.6.2-dev")
+# The version number of the regressiontest tarball against which this
+# git branch can be tested. Normally, this will be the version of the
+# last patch release. Comment the next line out for branches leading
+# to a major/minor release.
+set(REGRESSIONTEST_VERSION "4.6.1")
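+# Illustrative sketch (not part of this change): the value above matches
+# the tarball naming used for downloads, i.e. one could construct
+#   "http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz"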
set(CUSTOM_VERSION_STRING ""
CACHE STRING "Custom version string (if empty, use hard-coded default)")
mark_as_advanced(CUSTOM_VERSION_STRING)
option(GMX_X11 "Use X window system" OFF)
if (GMX_X11)
- find_package(X11)
- # X11 includes/libraries are only set in the ngmx subdirectory!
- if(X11_FOUND)
- set(HAVE_X11 1)
- endif(X11_FOUND)
+ find_package(X11)
+ # X11 includes/libraries are only set in the ngmx subdirectory!
+ if(NOT X11_FOUND)
+ message(WARNING "X11 include files and/or libraries were not found. Will not build the GROMACS X11-binaries, such as ngmx")
+ endif()
endif(GMX_X11)
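# Usage sketch (illustrative): the X11 binaries are enabled at configure
# time with e.g.
#   cmake .. -DGMX_X11=ON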
include(ThreadMPI)
+++ /dev/null
- Welcome to GROMACS!
-
-*Note*: Detailed, step-by-step installation instructions
-are available on the website
-http://www.gromacs.org/Downloads/Installation_Instructions.
-
-*Note*: If you want to use automake for building look at INSTALL.
- However, automake will be deprecated in releases after 4.5
-
-
-
-Cmake (cross-platform make) is a relatively new build system that
-is gaining in popularity. One of the biggest selling points is
-compatibility with MS Windows. Starting with the 4.5 release,
-it is possible to configure and compile the source code with it.
-
-GETTING CMAKE
-
-Packages for various platforms can be found on the project's download page.
-Most of the Linux distributions come with packages available through the
-corresponding package manage. Make sure the installed version is 2.6 or later.
-Using CMake
-
-Please read carefully the documentation on the CMake website. Developers
-may look at some of the online tutorials.
-
-CONFIGURING WITH DEFAULTS SETTINGS
-
-It is advisable that the configuration and the build of the binaries are done
-outside of the source tree. On Linux/Mac, the following will configure
-the build with the default settings:
-
-$ tar xvfz gromacs-4.5.tar.gz
-$ ls
-gromacs-4.5
-$ mkdir build
-$ cd build
-$ cmake ../gromacs-4.5
-$ make
-
-On multi-core CPU systems (very likely nowadays), the parallel make will do the job much faster:
-
-$ make -j 4
-$ make install
-
-Substitute 4 with the number of available cores.
-
-CONFIGURING WITH CUSTOM OPTIONS
-
-Custom options can be set in a few different ways.A list of the more commonly
-used ones can be found at http://www.gromacs.org/Developer_Zone/Cmake/Custom_options.
-
- *command line flag
-
-The values of custom options are supplied with the -D flag. Note that the source path should
-be the last argument (otherwise remove the space between -D and the option)!
-
-$ cmake -D GMX_DOUBLE=ON ../gromacs-4.5
-
- *interactive CMake session
-
-$ cmake -i ../gromacs-4.5
-
- *curses cmake interface (ccmake)
-
-$ ccmake ../gromacs-4.5
-
- *CMake GUI
-
-$ cmake-gui ../gromacs-4.5
-
-Explanation about the different options will be presented when using any of the
-interactive, curses or gui methods.
-
-All configure options are saved in the CMakeCache.txt file in the build directory.
-The file can be edited using a text editor, but after that cmake should be run again.
-
-$ vim CMakeCache.txt
-$ cmake ../gromacs-4.5
Welcome to the official version of GROMACS!
-If you are familiar with unix, it should be fairly trivial to compile and
-install GROMACS. Installation instructions for CMake are available in the
-INSTALL.* files (the use of autotools is no longer available). A more
-extended step-by-step guide can be found on our website http://www.gromacs.org.
+If you are familiar with Unix, it should be fairly trivial to compile and
+install GROMACS. GROMACS uses only the CMake build system, and our
+installation guide can be found at
+http://www.gromacs.org/Documentation/Installation_Instructions.
Of course we will do our utmost to help you with any problems, but PLEASE
READ THE INSTALLATION INSTRUCTIONS BEFORE CONTACTING US!
MPI, threads, double precision, etc.
If you still want to distribute a modified version or use part of GROMACS
-in your own program, remember that the entire modified must be licensed
-under GPL, and that it must clearly be labeled as derived work. It should
-not use the name "official GROMACS", and make sure support questions are
-directed to you instead of the GROMACS developers.
+in your own program, remember that the entire project must be licensed
+according to the requirements of the LGPL v2.1 license under which you
+received this copy of GROMACS. We request that it be clearly labeled as
+derived work. It should not use the name "official GROMACS", and you
+should make sure support questions are directed to you instead of the
+GROMACS developers.
Sorry for the hard wording, but it is meant to protect YOUR research results!
* * * * *
* GROMACS: A message-passing parallel molecular dynamics implementation
H.J.C. Berendsen, D. van der Spoel and R. van Drunen
Comp. Phys. Comm. 91, 43-56 (1995)
+ DOI: 10.1016/0010-4655(95)00042-E
* GROMACS 4: Algorithms for highly efficient, load-balanced, and scalable
molecular simulation
B. Hess and C. Kutzner and D. van der Spoel and E. Lindahl
J. Chem. Theory Comput. 4 (2008) pp. 435-447
+ DOI: 10.1021/ct700301q
+
+* GROMACS 4.5: a high-throughput and highly parallel open source
+ molecular simulation toolkit
+ Sander Pronk, Szilárd Páll, Roland Schulz, Per Larsson, Pär Bjelkmar,
+ Rossen Apostolov, Michael R. Shirts, Jeremy C. Smith, Peter M. Kasson,
+ David van der Spoel, Berk Hess, Erik Lindahl.
+ Bioinformatics (2013)
+ DOI: 10.1093/bioinformatics/btt055
There are a lot of cool features we'd like to include in future versions,
but our resources are limited. All kinds of donations are welcome, both in
form of code, hardware and funding! Industrial users who choose to pay
-for a license pro bono (it is still GPL and can be redistributed freely) or
+for a license pro bono (it is still LGPL and can be redistributed freely) or
contribute in other ways are listed as GROMACS supporters on our webpages.
Don't hesitate to contact us at gromacs@gromacs.org if you are interested.
+++ /dev/null
-
-To support special compiler and OS combinations like the Portland
-compilers on Linux/x86, Compaq compilers on Linux/Alpha, and enable
-shared libraries when using wrapper scripts like mpcc we are using
-a prerelease version of libtool (1.4e), and the libtool.m4 script has been
-patched and included at the end of acinclude.m4.
-
-PLEASE NOTE - You NEVER have to install libtool, and autoconf/automake
-are only necessary for developers who change the Makefile.in's. You can
-find the software at ftp.gromacs.org/developers.
-
-This file summarizes the patches/extensions we made so we can check it
-if we ever update the files in GROMACS:
-
-
-
-**********************************************************************
-
-B. Changes in libtool-1.4e:
- All these changes should go in libtool.m4. This file is included
- in the GROMACS acinclude.m4 to make sure we use the right version.
-
-1. I have implemented Fortran 77 support in libtool, which hopefully
- will make its way into the main distribution. This involves
- the AC_LIBTOOL_LANG_F77_CONFIG macro, and all supporting routines
- with "F77" in the name :-)
- Since a couple of f77 compilers (ibm xlf notably) dont accept -DPIC
- I also separated the compiler flag test into one for -fPIC and
- a separate one for -DPIC.
- I have changed echo to printf for the lt_simple.. variables, to
- get the fortran formatting right.
-
- There is one specific Gromacs-related change that won't be a
- part of the main libtool distribution:
-
- I removed the --with-tags argument from _LT_AC_TAGCONFIG,
- to fix libtool always creating the F77 and CXX macros,
- and replaced it with a check for ac_cv_prog_**_g to determine
- whether we should use CXX and/or F77. Don't care about GCJ...
- To avoid AC_PROG_CXX always being expanded, I introduced the macro
- inside an always-false test in our configure.ac.
-
-2. To support dynamic libraries on Alpha/Linux with the Compaq Compilers,
- I have added some flag specifications in the non-gcc compiler section
- of the AC_LIBTOOL_PROG_COMPILER_PIC macro. I added the linux*) case:
-
-=============================================================
- irix5* | irix6* | nonstopux*)
- _LT_AC_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
- # PIC (with -KPIC) is the default.
- _LT_AC_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
- ;;
-
-+ linux*)
-+ # Check flags for non-gnu compilers on Linux
-+ case "$host_cpu" in
-+ alpha*)
-+ # The only non-gnu compiler on Linux/Alpha is the Compaq one:
-+ _LT_AC_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-+ _LT_AC_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-+ ;;
-+
-+ i?86)
-+ # Intel icc compiler
-+ if $CC -V 2>&1 | grep 'Intel Corporation' > /dev/null 2>&1; then
-+ _LT_AC_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption,link,'
-+ _LT_AC_TAGVAR(lt_prog_compiler_static, $1)='-static'
-+ else
-+ _LT_AC_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-+ fi
-+ ;;
-+
-+ ia64)
-+ # Intel icc compiler
-+ if $CC -V 2>&1 | grep 'Intel Corporation' > /dev/null 2>&1; then
-+ _LT_AC_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption,link,'
-+ _LT_AC_TAGVAR(lt_prog_compiler_static, $1)='-static'
-+ else
-+ _LT_AC_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-+ fi
-+ ;;
-+
-+ *)
-+ _LT_AC_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-+ ;;
-+ esac
-+ ;;
-
- newsos6)
- _LT_AC_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
- _LT_AC_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
- ;;
-
-
-=====================================================================
-The intel compilers echoes stupid stuff to stderr, which is interpreted
-as errors when libtool checks for supported flags. As a workaround we
-only grep for the actual flag tried, or the words "flag" or "option".
-The modified versions of the macros are:
-
-
-# AC_LIBTOOL_COMPILER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
-# [OUTPUT-FILE], [ACTION-SUCCESS], [ACTION-FAILURE])
-# ----------------------------------------------------------------
-# Check whether the given compiler option works
-AC_DEFUN([AC_LIBTOOL_COMPILER_OPTION],
-[AC_CACHE_CHECK([$1], [$2],
- [$2=no
- ifelse([$4], , [ac_outfile=conftest.$ac_objext], [ac_outfile=$4])
- save_CFLAGS="$CFLAGS"
- CFLAGS="$CFLAGS $3"
- printf "$lt_simple_compile_test_code" > conftest.$ac_ext
- if (eval $ac_compile 2>conftest.err) && test -s $ac_outfile; then
- # Modified by Erik Lindahl:
- # Some compilers (icc, pgcc) echo stupid stuff to stderr.
- # To avoid this being interpreted as errors we check the output
- # and only fail the test if the option is present, or one of the
- # words 'option' or 'flag'.
- if test -n "$3" && grep "$3" conftest.err > /dev/null 2>&1 || grep "option" conftest.err > /dev/null 2>&1 || grep "flag" conftest.err > /dev/null 2>&1 ; then
- # Append any errors to the config.log.
- cat conftest.err 1>&AS_MESSAGE_LOG_FD
- else
- $2=yes
- fi
- fi
- $rm conftest*
- CFLAGS="$save_CFLAGS"
-])
-
-if test x"[$]$2" = xyes; then
- ifelse([$5], , :, [$5])
-else
- ifelse([$6], , :, [$6])
-fi
-])# AC_LIBTOOL_COMPILER_OPTION
-
-
-# AC_LIBTOOL_LINKER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
-# [ACTION-SUCCESS], [ACTION-FAILURE])
-# ------------------------------------------------------------
-# Check whether the given compiler option works
-AC_DEFUN([AC_LIBTOOL_LINKER_OPTION],
-[AC_CACHE_CHECK([$1], [$2],
- [$2=no
- save_LDFLAGS="$LDFLAGS"
- LDFLAGS="$LDFLAGS $3"
- printf "$lt_simple_link_test_code" > conftest.$ac_ext
- if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
- # The compiler can only warn and ignore the option if not recognized
- # So say no if there are warnings
- if test -n "$3" && grep "$3" conftest.err > /dev/null 2>&1 || grep "option" conftest.err > /dev/null 2>&1 || grep "flag" conftest.err > /dev/null 2>&1 ; then
- # Append any errors to the config.log.
- cat conftest.err 1>&AS_MESSAGE_LOG_FD
- else
- $2=yes
- fi
- fi
- $rm conftest*
- LDFLAGS="$save_LDFLAGS"
-])
-
-if test x"[$]$2" = xyes; then
- ifelse([$4], , :, [$4])
-else
- ifelse([$5], , :, [$5])
-fi
-])# AC_LIBTOOL_LINKER_OPTION
-
-
-
-===============================================================
-
-That's it. I've submitted most patches and extensions to
-automake/libtool mailing lists, so hopefully the next release will be
-easier if they are included in future versions :-)
-
-Erik Lindahl <lindahl@gromacs.org>, 2002-01-23
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\end{enumerate}
Or, as a sequence of commands to execute:
\begin{verbatim}
-tar xfz gromacs-4.6.tar.gz
-cd gromacs-4.6
+tar xfz gromacs-4.6.1.tar.gz
+cd gromacs-4.6.1
mkdir build
cd build
cmake .. -DGMX_BUILD_OWN_FFTW=ON
acceleration available for implicit solvent simulations in
\gromacs{} at the moment. However, the long-term plan is to enable
this functionality in core Gromacs, and not have the OpenMM
-interface supported by the \gromacs team. RIght now there might be
-some build issues for OpenMM, but they should be fixed by release 4.6.1.
+interface supported by the \gromacs{} team. Right now there are
+some build issues for OpenMM.
If you wish to run in parallel on multiple machines across a network,
you will need to have
\subsection{Optional build components}
\begin{itemize}
-\item Hardware-optimized \blas{} and \lapack{} libraries are useful for
- a few of the \gromacs{} utilities focused on normal modes and matrix manipulation,
- but they does not provide any benefits for normal simulations.
+\item Hardware-optimized \blas{} and \lapack{} libraries are useful
+ for a few of the \gromacs{} utilities focused on normal modes and
+ matrix manipulation, but they do not provide any benefits for normal
+  simulations. Configuring these is discussed
+ \hyperlink{linear-algebra}{here}.
\item The built-in \gromacs{} trajectory viewer \verb+ngmx+ requires
X11 and Motif/Lesstif libraries and header files. Generally, the
\gromacs{} team rather recommends you use third-party software for
example, download the source tarball and use
% TODO: keep up to date with new releases!
\begin{verbatim}
-$ tar xfz gromacs-4.6.tgz
-$ cd gromacs-4.6
+$ tar xfz gromacs-4.6.1.tgz
+$ cd gromacs-4.6.1
$ mkdir build-cmake
$ cd build-cmake
$ cmake ..
See also: \url{http://cmake.org/Wiki/CMake_Useful_Variables#Environment_Variables}
+\subsection{Linear algebra libraries}\hypertarget{linear-algebra}{}
+As mentioned above, sometimes vendor \blas{} and \lapack{} libraries
+can provide performance enhancements for \gromacs{} when doing
+normal-mode analysis or covariance analysis. For simplicity, the text
+below will refer only to \blas{}, but the same options are available
+for \lapack{}. By default, the \cmake{} option
+\verb+GMX_EXTERNAL_BLAS+ is on, which triggers \cmake{} to search for
+\blas{}. If one is found, then it is used. Otherwise, \cmake{} falls
+back on internal versions provided in the \gromacs{} source. These are
+fine for normal use. If you need to specify a non-standard path to
+search, use \verb+-DCMAKE_PREFIX_PATH=/path/to/search+.
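+
+As an illustrative sketch (the path here is hypothetical), pointing the
+search at a vendor \blas{} installed under \verb+/opt/vendor-blas+ might
+look like:
+\begin{verbatim}
+cmake .. -DCMAKE_PREFIX_PATH=/opt/vendor-blas
+\end{verbatim}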
+
+On Apple platforms where the Accelerate Framework is available, it
+will automatically be used for \blas{} and \lapack{}.
+
\subsection{Native GPU acceleration}
If you have the \cuda{} Software Development Kit installed, you can
use \cmake{} with:
\gromacs{} will suffix binaries and libraries for such builds with
'\verb+_d+' for double precision and/or '\verb+_mpi+' for \mpi{} (and
nothing otherwise). This can be controlled manually with
-\verb+GMX_DEFAULT_SUFFIX+, \verb+GMX_BINARY_SUFFIX+ and
-\verb+GMX_LIBRARY_SUFFIX+. This can also be useful for resolving
-libary-naming conflicts with existing packges (\verb+GMX_PREFIX_LIBMD+
-also can be useful).
+\verb+GMX_DEFAULT_SUFFIX (ON/OFF)+, \verb+GMX_BINARY_SUFFIX+ (takes
+a string) and \verb+GMX_LIBS_SUFFIX+ (also takes a string).
+This can also be useful for resolving library-naming conflicts with
+existing packages (\verb+GMX_PREFIX_LIBMD+ can also be useful).
+For instance, to set a custom suffix for binaries and libraries,
+one might specify:
+
+\begin{verbatim}
+cmake .. -DGMX_DEFAULT_SUFFIX=OFF -DGMX_BINARY_SUFFIX=_mod -DGMX_LIBS_SUFFIX=_mod
+\end{verbatim}
+
+Thus the names of all binaries and libraries will have \verb+_mod+
+appended.
\subsection{Building \gromacs{}}
\verb+-DREGRESSIONTEST_DOWNLOAD+, and run \verb+make check+.
\gromacs{} will automatically download and run the tests for you.
Alternatively, you can download and unpack the tarball yourself from
-\url{http://gerrit.gromacs.org/download/regressiontests-4.6.tar.gz},
+\url{http://gerrit.gromacs.org/download/regressiontests-4.6.1.tar.gz},
and use the advanced \cmake{} option \verb+REGRESSIONTEST_PATH+ to
specify the path to the unpacked tarball, which will then be used for
testing. If this doesn't work, then please read on.
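As a sketch of the manual route (paths are illustrative):
\begin{verbatim}
tar xfz regressiontests-4.6.1.tar.gz
cmake .. -DREGRESSIONTEST_PATH=/path/to/regressiontests-4.6.1
make check
\end{verbatim}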
\subsubsection{BlueGene/P}
There is currently no native acceleration on this platform, but the
-default plain C kernels will work. Toolchain files will be improved in
-\gromacs{} 4.6.1.
+default plain C kernels will work.
\subsubsection{Fujitsu PRIMEHPC}
This is the architecture of the K computer, which uses Fujitsu Sparc64viiifx
chips. Gromacs-4.6 will build with default C kernels on this architecture,
-and Gromacs-4.6.1 will add accelerated kernels and a custom toolchain.
+and Gromacs-4.6.2 will add accelerated kernels and a custom toolchain.
\section{Tested platforms}
set dir = $cwd
-set VER = 4.6
+set VER = 4.6.2
+set DATE = `date "+%B %d, %Y"`
set MANDIR = online
set HTML = $cwd/html
set HTMLOL = $HTML/$MANDIR
<td ALIGN=LEFT VALIGN=TOP WIDTH=280>
<br><br>
<h2>
-GROMACS 4.6<br>
+GROMACS $VER<br>
Online Reference</h2>
</td>
</TABLE></TD>
<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH="*" NOSAVE>
-<B>VERSION 4.6<br>
-Sat 19 Jan 2013</B></td>
+<B>VERSION $VER<br>
+$DATE</B></td>
</tr>
</table>
echo -n "$program "
cd $HTMLOL
$GMXBINDIR/$program -quiet -man html >& /dev/null
+ cd ..
endif
endif
end
echo "\
The use of autotools for installing GROMACS is deprecated from 4.6 onwards
-Please switch to CMake. Check http://www.gromacs.org/Developer_Zone/Cmake
+Please switch to CMake. Check out
+http://www.gromacs.org/Documentation/Installation_Instructions
for instructions."
#undef GMX_SIMD_WIDTH_HERE
-#undef gmx_epi32
-
/* float/double SIMD register type */
#undef gmx_mm_pr
+/* integer SIMD register type, only used in the tabulated PME kernels */
+#undef gmx_epi32
+
#undef gmx_load_pr
#undef gmx_load1_pr
#undef gmx_set1_pr
#undef gmx_setzero_pr
#undef gmx_store_pr
-/* Only used for debugging */
-#undef gmx_storeu_pr
#undef gmx_add_pr
#undef gmx_sub_pr
#undef gmx_mul_pr
#undef gmx_max_pr
#undef gmx_cmplt_pr
+/* gmx_blendzero_pr(real a, boolean b) does: (b ? a : 0) */
+#undef gmx_blendzero_pr
+/* Logical operations on SIMD booleans */
#undef gmx_and_pr
#undef gmx_or_pr
#undef gmx_andnot_pr
-/* Only used to speed up the nbnxn tabulated PME kernels */
+/* Not required, only used to speed up the nbnxn tabulated PME kernels */
+#undef GMX_HAVE_SIMD_FLOOR
#undef gmx_floor_pr
-/* Only used with x86 when blendv is faster than comparison */
+/* Not required, only used when blendv is faster than comparison */
+#undef GMX_HAVE_SIMD_BLENDV
#undef gmx_blendv_pr
+/* Not required: gmx_anytrue_pr(x) returns whether any of the booleans in x is True.
+ * If this is not present, define GMX_SIMD_IS_TRUE(real x),
+ * which should return x==True, where True is True as defined in SIMD.
+ */
+#undef GMX_HAVE_SIMD_ANYTRUE
+#undef gmx_anytrue_pr
+
+/* Integer set and cast are only used for nbnxn exclusion masks */
+#undef gmx_set1_epi32
+#undef gmx_castsi_pr
+/* For topology exclusion pair checking we need: ((a & b) ? True : False)
+ * when we do a bit-wise and between a and b.
+ * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b).
+ * Otherwise we do all operations, except for the set1, in reals.
+ */
+#undef gmx_load_si
+/* If the same bit is set in both input masks, return all bits 1, otherwise 0 */
+#undef gmx_checkbitmask_epi32
+/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
+ * identical 32-bit masks are set in one double and one or both can be used.
+ */
+#undef gmx_checkbitmask_pr
-#undef gmx_movemask_pr
-
-/* Integer casts are only used for nbnxn x86 exclusion masks */
-#undef gmx_mm_castsi128_pr
-#undef gmx_mm_castsi256_pr
-
-/* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
+/* Conversions only used for PME table lookup */
#undef gmx_cvttpr_epi32
#undef gmx_cvtepi32_pr
#undef gmx_pmecorrV_pr
-/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
-#undef gmx_mm_hpr
-
-#undef gmx_load_hpr
-#undef gmx_load1_hpr
-#undef gmx_store_hpr
-#undef gmx_add_hpr
-#undef gmx_sub_hpr
-
-#undef gmx_sum4_hpr
-
-#undef gmx_2hpr_to_pr
-
-
-/* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
- * the same intrinsics, with defines, can be compiled for either 128 or 256
- * bit wide SSE or AVX instructions.
- * The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
+/* The same SIMD macros can be translated to SIMD intrinsics and compiled
+ * to instructions for different SIMD widths and float precisions.
+ * On x86, the gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
* The _pr suffix is replaced by _ps or _pd (single or double precision).
* Note that compiler settings will decide if 128-bit intrinsics will
* be translated into SSE or AVX instructions.
*/
-#if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
-#error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
-#endif
-#if defined GMX_MM128_HERE && defined GMX_MM256_HERE
-#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
+/* Generic macros for obtaining a SIMD aligned pointer from pointer x */
+#undef gmx_simd_align_real
+#undef gmx_simd_align_int
+
+
+#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
+#if defined GMX_X86_AVX_256
+/* We have half SIMD width support, continue */
+#else
+#error "half SIMD width intrinsics are not supported"
+#endif
#endif
#ifdef GMX_X86_SSE2
-#ifdef GMX_MM128_HERE
-
-#define gmx_epi32 __m128i
+#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
#ifndef GMX_DOUBLE
#define gmx_mm_pr __m128
+#define gmx_epi32 __m128i
+
#define gmx_load_pr _mm_load_ps
#define gmx_load1_pr _mm_load1_ps
#define gmx_set1_pr _mm_set1_ps
#define gmx_setzero_pr _mm_setzero_ps
#define gmx_store_pr _mm_store_ps
-#define gmx_storeu_pr _mm_storeu_ps
#define gmx_add_pr _mm_add_ps
#define gmx_sub_pr _mm_sub_ps
#define gmx_mul_pr _mm_mul_ps
#define gmx_max_pr _mm_max_ps
#define gmx_cmplt_pr _mm_cmplt_ps
+#define gmx_blendzero_pr _mm_and_ps
#define gmx_and_pr _mm_and_ps
#define gmx_or_pr _mm_or_ps
#define gmx_andnot_pr _mm_andnot_ps
+#ifdef GMX_X86_SSE4_1
+#define GMX_HAVE_SIMD_FLOOR
#define gmx_floor_pr _mm_floor_ps
+#define GMX_HAVE_SIMD_BLENDV
#define gmx_blendv_pr _mm_blendv_ps
+#endif
-#define gmx_movemask_pr _mm_movemask_ps
+#define GMX_HAVE_SIMD_ANYTRUE
+#define gmx_anytrue_pr _mm_movemask_ps
-#define gmx_mm_castsi128_pr gmx_mm_castsi128_ps
+#define gmx_set1_epi32 _mm_set1_epi32
+#define gmx_castsi_pr gmx_mm_castsi128_ps
+#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
+#define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
#define gmx_cvttpr_epi32 _mm_cvttps_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_ps
#define gmx_mm_pr __m128d
+#define gmx_epi32 __m128i
+
#define gmx_load_pr _mm_load_pd
#define gmx_load1_pr _mm_load1_pd
#define gmx_set1_pr _mm_set1_pd
#define gmx_setzero_pr _mm_setzero_pd
#define gmx_store_pr _mm_store_pd
-#define gmx_storeu_pr _mm_storeu_pd
#define gmx_add_pr _mm_add_pd
#define gmx_sub_pr _mm_sub_pd
#define gmx_mul_pr _mm_mul_pd
#define gmx_max_pr _mm_max_pd
#define gmx_cmplt_pr _mm_cmplt_pd
+#define gmx_blendzero_pr _mm_and_pd
#define gmx_and_pr _mm_and_pd
#define gmx_or_pr _mm_or_pd
#define gmx_andnot_pr _mm_andnot_pd
+#ifdef GMX_X86_SSE4_1
+#define GMX_HAVE_SIMD_FLOOR
#define gmx_floor_pr _mm_floor_pd
+#define GMX_HAVE_SIMD_BLENDV
#define gmx_blendv_pr _mm_blendv_pd
+#endif
-#define gmx_movemask_pr _mm_movemask_pd
+#define GMX_HAVE_SIMD_ANYTRUE
+#define gmx_anytrue_pr _mm_movemask_pd
-#define gmx_mm_castsi128_pr gmx_mm_castsi128_pd
+#define gmx_set1_epi32 _mm_set1_epi32
+#define gmx_castsi_pr gmx_mm_castsi128_pd
+#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
+#define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
#define gmx_cvttpr_epi32 _mm_cvttpd_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_pd
#endif /* ifndef GMX_DOUBLE */
-#endif /* GMX_MM128_HERE */
-
-#ifdef GMX_MM256_HERE
-
-#define gmx_epi32 __m256i
+#else
+/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
+ * so we use 256-bit SIMD.
+ */
#ifndef GMX_DOUBLE
#define gmx_mm_pr __m256
+#define gmx_epi32 __m256i
+
#define gmx_load_pr _mm256_load_ps
#define gmx_load1_pr(x) _mm256_set1_ps((x)[0])
#define gmx_set1_pr _mm256_set1_ps
#define gmx_setzero_pr _mm256_setzero_ps
#define gmx_store_pr _mm256_store_ps
-#define gmx_storeu_pr _mm256_storeu_ps
#define gmx_add_pr _mm256_add_ps
#define gmx_sub_pr _mm256_sub_ps
#define gmx_mul_pr _mm256_mul_ps
#define gmx_max_pr _mm256_max_ps
-/* Not-equal (ordered, non-signaling) */
-#define gmx_cmpneq_pr(x, y) _mm256_cmp_ps(x, y, 0x0c)
-/* Less-than (ordered, non-signaling) */
+/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
+#define gmx_blendzero_pr _mm256_and_ps
#define gmx_and_pr _mm256_and_ps
#define gmx_or_pr _mm256_or_ps
#define gmx_andnot_pr _mm256_andnot_ps
+#define GMX_HAVE_SIMD_FLOOR
#define gmx_floor_pr _mm256_floor_ps
+#define GMX_HAVE_SIMD_BLENDV
#define gmx_blendv_pr _mm256_blendv_ps
-#define gmx_movemask_pr _mm256_movemask_ps
+#define GMX_HAVE_SIMD_ANYTRUE
+#define gmx_anytrue_pr _mm256_movemask_ps
-#define gmx_mm_castsi256_pr _mm256_castsi256_ps
+#define gmx_set1_epi32 _mm256_set1_epi32
+#define gmx_castsi_pr _mm256_castsi256_ps
+/* With <= 16 bits used the cast and conversion should not be required,
+ * since only mantissa bits are set and that would give a non-zero float,
+ * but with the Intel compiler this does not work correctly.
+ */
+#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)
#define gmx_cvttpr_epi32 _mm256_cvttps_epi32
#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
-#define gmx_loaddh_pr gmx_mm256_load4_ps
-
-/* Half SIMD-width type */
-#define gmx_mm_hpr __m128
-
-/* Half SIMD-width macros */
-#define gmx_load_hpr _mm_load_ps
-#define gmx_load1_hpr(x) _mm_set1_ps((x)[0])
-#define gmx_store_hpr _mm_store_ps
-#define gmx_add_hpr _mm_add_ps
-#define gmx_sub_hpr _mm_sub_ps
-
-#define gmx_sum4_hpr gmx_mm256_sum4h_m128
-
-/* Conversion between half and full SIMD-width */
-#define gmx_2hpr_to_pr gmx_mm256_set_m128
-
#else
#include "gmx_x86_simd_double.h"
#define gmx_mm_pr __m256d
+/* We use 128-bit integer registers because of missing 256-bit operations */
+#define gmx_epi32 __m128i
+
#define gmx_load_pr _mm256_load_pd
#define gmx_load1_pr(x) _mm256_set1_pd((x)[0])
#define gmx_set1_pr _mm256_set1_pd
#define gmx_setzero_pr _mm256_setzero_pd
#define gmx_store_pr _mm256_store_pd
-#define gmx_storeu_pr _mm256_storeu_pd
#define gmx_add_pr _mm256_add_pd
#define gmx_sub_pr _mm256_sub_pd
#define gmx_mul_pr _mm256_mul_pd
#define gmx_max_pr _mm256_max_pd
-/* Not-equal (ordered, non-signaling) */
-#define gmx_cmpneq_pr(x, y) _mm256_cmp_pd(x, y, 0x0c)
-/* Less-than (ordered, non-signaling) */
+/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+#define gmx_blendzero_pr _mm256_and_pd
#define gmx_and_pr _mm256_and_pd
#define gmx_or_pr _mm256_or_pd
#define gmx_andnot_pr _mm256_andnot_pd
+#define GMX_HAVE_SIMD_FLOOR
#define gmx_floor_pr _mm256_floor_pd
+#define GMX_HAVE_SIMD_BLENDV
#define gmx_blendv_pr _mm256_blendv_pd
-#define gmx_movemask_pr _mm256_movemask_pd
+#define GMX_HAVE_SIMD_ANYTRUE
+#define gmx_anytrue_pr _mm256_movemask_pd
-#define gmx_mm_castsi256_pr _mm256_castsi256_pd
+#define gmx_set1_epi32 _mm256_set1_epi32
+#define gmx_castsi_pr _mm256_castsi256_pd
+/* With <= 16 bits used the cast and conversion should not be required,
+ * since only mantissa bits are set and that would give a non-zero float,
+ * but with the Intel compiler this does not work correctly.
+ * Because AVX does not have int->double conversion, we convert via float.
+ */
+#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)
#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_pd
#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_pd
-#endif
+#endif /* GMX_DOUBLE */
-#endif /* GMX_MM256_HERE */
+#endif /* 128- or 256-bit x86 SIMD */
#endif /* GMX_X86_SSE2 */
+
+
+/* Generic macros to extract a SIMD aligned pointer from a pointer x.
+ * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
+ * to how many you want to use, to avoid indexing outside the aligned region.
+ */
+
+#define gmx_simd_align_real(x) (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))))
+
+#define gmx_simd_align_int(x) (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))))
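+
+/* Usage sketch (illustrative, not part of this patch): with padding of
+ * GMX_SIMD_WIDTH_HERE extra elements, as required above, an aligned
+ * pointer is obtained as:
+ *
+ *   real  buf[N + GMX_SIMD_WIDTH_HERE];
+ *   real *abuf = gmx_simd_align_real(buf);
+ *
+ * abuf can then be used for aligned gmx_load_pr/gmx_store_pr accesses.
+ */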
/* Use SIMD accelerated nbnxn search and kernels */
#define GMX_NBNXN_SIMD
-#ifdef GMX_X86_AVX_256
-/* Note that setting this to 128 will also work with AVX-256, but slower */
+/* Uncomment the next line to use the slower 128-bit SIMD with AVX-256 */
+/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
+
+#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
#define GMX_NBNXN_SIMD_BITWIDTH 256
#else
#define GMX_NBNXN_SIMD_BITWIDTH 128
real *x; /* x and possibly q, size natoms*xstride */
real *simd_4xn_diag; /* indices to set the SIMD 4xN diagonal masks */
real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N) diagonal masks */
+ unsigned *simd_excl_mask; /* exclusion masks for SIMD topology exclusions */
int nout; /* The number of force arrays */
nbnxn_atomdata_output_t *out; /* Output data structures */
int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
_g_hbond_compl() {
local p c
COMPREPLY=() c=${COMP_WORDS[COMP_CWORD]} p=${COMP_WORDS[COMP_CWORD-1]}
-if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -f -s -n -num -g -ac -dist -ang -hx -hbn -hbm -don -dan -life -nhbdist -h -version -nice -b -e -dt -tu -xvg -a -r -noda -r2 -abin -rbin -nonitacc -contact -shell -fitstart -fitstart -temp -smooth -dump -max_hb -nomerge -geminate -diff -acflen -nonormalize -P -fitfn -ncskip -beginfit -endfit' -- $c)); return 0; fi
+if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -f -s -n -num -g -ac -dist -ang -hx -hbn -hbm -don -dan -life -nhbdist -h -version -nice -b -e -dt -tu -xvg -a -r -noda -r2 -abin -rbin -nonitacc -contact -shell -fitstart -fitstart -temp -smooth -dump -max_hb -nomerge -geminate -diff -nthreads -acflen -nonormalize -P -fitfn -ncskip -beginfit -endfit' -- $c)); return 0; fi
case "$p" in
-tu) COMPREPLY=( $(compgen -W ' fs ps ns us ms s ' -- $c ));;
-xvg) COMPREPLY=( $(compgen -W ' xmgrace xmgr none ' -- $c ));;
_g_sans_compl() {
local p c
COMPREPLY=() c=${COMP_WORDS[COMP_CWORD]} p=${COMP_WORDS[COMP_CWORD-1]}
-if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -f -n -d -pr -sq -prframe -sqframe -h -version -nice -b -e -dt -tu -xvg -mode -mcover -nopbc -startq -endq -qstep -seed' -- $c)); return 0; fi
+if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -f -n -d -pr -sq -prframe -sqframe -h -version -nice -b -e -dt -tu -xvg -mode -mcover -nopbc -startq -endq -qstep -seed -nt' -- $c)); return 0; fi
case "$p" in
-tu) COMPREPLY=( $(compgen -W ' fs ps ns us ms s ' -- $c ));;
-xvg) COMPREPLY=( $(compgen -W ' xmgrace xmgr none ' -- $c ));;
_mdrun_compl() {
local p c
COMPREPLY=() c=${COMP_WORDS[COMP_CWORD]} p=${COMP_WORDS[COMP_CWORD-1]}
-if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -o -x -cpi -cpo -c -e -g -dhdl -field -table -tabletf -tablep -tableb -rerun -tpi -tpid -ei -eo -j -jo -ffout -devout -runav -px -pf -ro -ra -rs -rt -mtx -dn -multidir -membed -mp -mn -h -version -nice -deffnm -xvg -pd -dd -ddorder -npme -nt -ntmpi -ntomp -ntomp_pme -nopin -pinht -pinoffset -gpu_id -noddcheck -rdd -rcon -dlb -dds -gcom -nb -notunepme -testverlet -v -nocompact -seppot -pforce -reprod -cpt -cpnum -noappend -nsteps -maxh -multi -replex -nex -reseed -ionize' -- $c)); return 0; fi
+if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -o -x -cpi -cpo -c -e -g -dhdl -field -table -tabletf -tablep -tableb -rerun -tpi -tpid -ei -eo -j -jo -ffout -devout -runav -px -pf -ro -ra -rs -rt -mtx -dn -multidir -membed -mp -mn -h -version -nice -deffnm -xvg -pd -dd -ddorder -npme -nt -ntmpi -ntomp -ntomp_pme -pin -pinoffset -pinstride -gpu_id -noddcheck -rdd -rcon -dlb -dds -gcom -nb -notunepme -testverlet -v -nocompact -seppot -pforce -reprod -cpt -cpnum -noappend -nsteps -maxh -multi -replex -nex -reseed -ionize' -- $c)); return 0; fi
case "$p" in
-xvg) COMPREPLY=( $(compgen -W ' xmgrace xmgr none ' -- $c ));;
-ddorder) COMPREPLY=( $(compgen -W ' interleave pp_pme cartesian ' -- $c ));;
+-pin) COMPREPLY=( $(compgen -W ' auto on off ' -- $c ));;
-dlb) COMPREPLY=( $(compgen -W ' auto no yes ' -- $c ));;
-nb) COMPREPLY=( $(compgen -W ' auto cpu gpu gpu_cpu ' -- $c ));;
-s) COMPREPLY=( $(compgen -X '!*.+(tpr|tpb|tpa)*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-rt) COMPREPLY=( $(compgen -X '!*.log*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mtx) COMPREPLY=( $(compgen -X '!*.mtx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-dn) COMPREPLY=( $(compgen -X '!*.ndx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
--multidir) COMPREPLY=( $(compgen -X '!*.rundir*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
+-multidir) COMPREPLY=( $(compgen -X '!*.*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-membed) COMPREPLY=( $(compgen -X '!*.dat*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mp) COMPREPLY=( $(compgen -X '!*.top*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mn) COMPREPLY=( $(compgen -X '!*.ndx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
_mdrun_mpi_compl() {
local p c
COMPREPLY=() c=${COMP_WORDS[COMP_CWORD]} p=${COMP_WORDS[COMP_CWORD-1]}
-if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -o -x -cpi -cpo -c -e -g -dhdl -field -table -tabletf -tablep -tableb -rerun -tpi -tpid -ei -eo -j -jo -ffout -devout -runav -px -pf -ro -ra -rs -rt -mtx -dn -multidir -membed -mp -mn -h -version -nice -deffnm -xvg -pd -dd -ddorder -npme -nt -ntmpi -ntomp -ntomp_pme -nopin -pinht -pinoffset -gpu_id -noddcheck -rdd -rcon -dlb -dds -gcom -nb -notunepme -testverlet -v -nocompact -seppot -pforce -reprod -cpt -cpnum -noappend -nsteps -maxh -multi -replex -nex -reseed -ionize' -- $c)); return 0; fi
+if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen -W ' -s -o -x -cpi -cpo -c -e -g -dhdl -field -table -tabletf -tablep -tableb -rerun -tpi -tpid -ei -eo -j -jo -ffout -devout -runav -px -pf -ro -ra -rs -rt -mtx -dn -multidir -membed -mp -mn -h -version -nice -deffnm -xvg -pd -dd -ddorder -npme -nt -ntmpi -ntomp -ntomp_pme -pin -pinoffset -pinstride -gpu_id -noddcheck -rdd -rcon -dlb -dds -gcom -nb -notunepme -testverlet -v -nocompact -seppot -pforce -reprod -cpt -cpnum -noappend -nsteps -maxh -multi -replex -nex -reseed -ionize' -- $c)); return 0; fi
case "$p" in
-xvg) COMPREPLY=( $(compgen -W ' xmgrace xmgr none ' -- $c ));;
-ddorder) COMPREPLY=( $(compgen -W ' interleave pp_pme cartesian ' -- $c ));;
+-pin) COMPREPLY=( $(compgen -W ' auto on off ' -- $c ));;
-dlb) COMPREPLY=( $(compgen -W ' auto no yes ' -- $c ));;
-nb) COMPREPLY=( $(compgen -W ' auto cpu gpu gpu_cpu ' -- $c ));;
-s) COMPREPLY=( $(compgen -X '!*.+(tpr|tpb|tpa)*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-tpi) COMPREPLY=( $(compgen -X '!*.xvg*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-tpid) COMPREPLY=( $(compgen -X '!*.xvg*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-ei) COMPREPLY=( $(compgen -X '!*.edi*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
--eo) COMPREPLY=( $(compgen -X '!*.edo*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
+-eo) COMPREPLY=( $(compgen -X '!*.xvg*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-j) COMPREPLY=( $(compgen -X '!*.gct*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-jo) COMPREPLY=( $(compgen -X '!*.gct*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-ffout) COMPREPLY=( $(compgen -X '!*.xvg*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-rt) COMPREPLY=( $(compgen -X '!*.log*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mtx) COMPREPLY=( $(compgen -X '!*.mtx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-dn) COMPREPLY=( $(compgen -X '!*.ndx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
--multidir) COMPREPLY=( $(compgen -X '!*.rundir*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
+-multidir) COMPREPLY=( $(compgen -X '!*.line_buf*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-membed) COMPREPLY=( $(compgen -X '!*.dat*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mp) COMPREPLY=( $(compgen -X '!*.top*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
-mn) COMPREPLY=( $(compgen -X '!*.ndx*(.gz|.Z)' -f $c ; compgen -S '/' -X '.*' -d $c ));;
complete g_filter "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-ol/f:*.{xtc,trr,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-oh/f:*.{xtc,trr,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "c/-/( f s n ol oh h version nice b e dt w nf all nonojump fit)/"
complete g_gyrate "n/-xvg/( xmgrace xmgr none)/" "n/-P/( 0 1 2 3)/" "n/-fitfn/( none exp aexp exp_exp vac exp5 exp7 exp9 erffit)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-o/f:*.xvg{,.gz,.Z}/" "n/-acf/f:*.xvg{,.gz,.Z}/" "c/-/( f s n o acf h version nice b e dt w xvg nmol q p moi nz acflen nonormalize P fitfn ncskip beginfit endfit)/"
complete g_h2order "n/-xvg/( xmgrace xmgr none)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-nm/f:*.ndx{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.xvg{,.gz,.Z}/" "c/-/( f n nm s o h version nice b e dt w xvg d sl)/"
-complete g_hbond "n/-tu/( fs ps ns us ms s)/" "n/-xvg/( xmgrace xmgr none)/" "n/-geminate/( none dd ad aa a4)/" "n/-P/( 0 1 2 3)/" "n/-fitfn/( none exp aexp exp_exp vac exp5 exp7 exp9 erffit)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-num/f:*.xvg{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-ac/f:*.xvg{,.gz,.Z}/" "n/-dist/f:*.xvg{,.gz,.Z}/" "n/-ang/f:*.xvg{,.gz,.Z}/" "n/-hx/f:*.xvg{,.gz,.Z}/" "n/-hbn/f:*.ndx{,.gz,.Z}/" "n/-hbm/f:*.xpm{,.gz,.Z}/" "n/-don/f:*.xvg{,.gz,.Z}/" "n/-dan/f:*.xvg{,.gz,.Z}/" "n/-life/f:*.xvg{,.gz,.Z}/" "n/-nhbdist/f:*.xvg{,.gz,.Z}/" "c/-/( f s n num g ac dist ang hx hbn hbm don dan life nhbdist h version nice b e dt tu xvg a r noda r2 abin rbin nonitacc contact shell fitstart fitstart temp smooth dump max_hb nomerge geminate diff acflen nonormalize P fitfn ncskip beginfit endfit)/"
+complete g_hbond "n/-tu/( fs ps ns us ms s)/" "n/-xvg/( xmgrace xmgr none)/" "n/-geminate/( none dd ad aa a4)/" "n/-P/( 0 1 2 3)/" "n/-fitfn/( none exp aexp exp_exp vac exp5 exp7 exp9 erffit)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-num/f:*.xvg{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-ac/f:*.xvg{,.gz,.Z}/" "n/-dist/f:*.xvg{,.gz,.Z}/" "n/-ang/f:*.xvg{,.gz,.Z}/" "n/-hx/f:*.xvg{,.gz,.Z}/" "n/-hbn/f:*.ndx{,.gz,.Z}/" "n/-hbm/f:*.xpm{,.gz,.Z}/" "n/-don/f:*.xvg{,.gz,.Z}/" "n/-dan/f:*.xvg{,.gz,.Z}/" "n/-life/f:*.xvg{,.gz,.Z}/" "n/-nhbdist/f:*.xvg{,.gz,.Z}/" "c/-/( f s n num g ac dist ang hx hbn hbm don dan life nhbdist h version nice b e dt tu xvg a r noda r2 abin rbin nonitacc contact shell fitstart fitstart temp smooth dump max_hb nomerge geminate diff nthreads acflen nonormalize P fitfn ncskip beginfit endfit)/"
complete g_helix "n/-prop/( RAD TWIST RISE LEN NHX DIP RMS CPHI RMSA PHI PSI HB3 HB4 HB5 CD222)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-to/f:*.g87{,.gz,.Z}/" "n/-cz/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-co/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "c/-/( s n f to cz co h version nice b e dt w r0 q noF db prop ev ahxstart ahxend)/"
complete g_helixorient "n/-xvg/( xmgrace xmgr none)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-oaxis/f:*.dat{,.gz,.Z}/" "n/-ocenter/f:*.dat{,.gz,.Z}/" "n/-orise/f:*.xvg{,.gz,.Z}/" "n/-oradius/f:*.xvg{,.gz,.Z}/" "n/-otwist/f:*.xvg{,.gz,.Z}/" "n/-obending/f:*.xvg{,.gz,.Z}/" "n/-otilt/f:*.xvg{,.gz,.Z}/" "n/-orot/f:*.xvg{,.gz,.Z}/" "c/-/( s f n oaxis ocenter orise oradius otwist obending otilt orot h version nice b e dt xvg sidechain incremental)/"
complete g_hydorder "n/-d/( z x y)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.xpm{,.gz,.Z}/" "n/-or/f:*.out{,.gz,.Z}/" "n/-Spect/f:*.out{,.gz,.Z}/" "c/-/( f n s o or Spect h version nice b e dt w d bw sgang1 sgang2 tblock nlevel)/"
complete g_rotacf "n/-xvg/( xmgrace xmgr none)/" "n/-P/( 0 1 2 3)/" "n/-fitfn/( none exp aexp exp_exp vac exp5 exp7 exp9 erffit)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-o/f:*.xvg{,.gz,.Z}/" "c/-/( f s n o h version nice b e dt w xvg d noaver acflen nonormalize P fitfn ncskip beginfit endfit)/"
complete g_rotmat "n/-xvg/( xmgrace xmgr none)/" "n/-ref/( none xyz xy)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-o/f:*.xvg{,.gz,.Z}/" "c/-/( f s n o h version nice b e dt w xvg ref skip fitxy nomw)/"
complete g_saltbr "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "c/-/( f s h version nice b e dt t sep)/"
-complete g_sans "n/-tu/( fs ps ns us ms s)/" "n/-xvg/( xmgrace xmgr none)/" "n/-mode/( direct mc)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-d/f:*.dat{,.gz,.Z}/" "n/-pr/f:*.xvg{,.gz,.Z}/" "n/-sq/f:*.xvg{,.gz,.Z}/" "n/-prframe/f:*.xvg{,.gz,.Z}/" "n/-sqframe/f:*.xvg{,.gz,.Z}/" "c/-/( s f n d pr sq prframe sqframe h version nice b e dt tu xvg mode mcover nopbc startq endq qstep seed)/"
+complete g_sans "n/-tu/( fs ps ns us ms s)/" "n/-xvg/( xmgrace xmgr none)/" "n/-mode/( direct mc)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-d/f:*.dat{,.gz,.Z}/" "n/-pr/f:*.xvg{,.gz,.Z}/" "n/-sq/f:*.xvg{,.gz,.Z}/" "n/-prframe/f:*.xvg{,.gz,.Z}/" "n/-sqframe/f:*.xvg{,.gz,.Z}/" "c/-/( s f n d pr sq prframe sqframe h version nice b e dt tu xvg mode mcover nopbc startq endq qstep seed nt)/"
complete g_sas "n/-xvg/( xmgrace xmgr none)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-o/f:*.xvg{,.gz,.Z}/" "n/-or/f:*.xvg{,.gz,.Z}/" "n/-oa/f:*.xvg{,.gz,.Z}/" "n/-tv/f:*.xvg{,.gz,.Z}/" "n/-q/f:*.pdb{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-i/f:*.itp{,.gz,.Z}/" "c/-/( f s o or oa tv q n i h version nice b e dt w xvg probe ndots qmax f_index minarea nopbc noprot dgs)/"
complete g_select "n/-xvg/( xmgrace xmgr none)/" "n/-selrpos/( atom res_com res_cog mol_com mol_cog whole_res_com whole_res_cog whole_mol_com whole_mol_cog part_res_com part_res_cog part_mol_com part_mol_cog dyn_res_com dyn_res_cog dyn_mol_com dyn_mol_cog)/" "n/-seltype/( atom res_com res_cog mol_com mol_cog whole_res_com whole_res_cog whole_mol_com whole_mol_cog part_res_com part_res_cog part_mol_com part_mol_cog dyn_res_com dyn_res_cog dyn_mol_com dyn_mol_cog)/" "n/-resnr/( number index)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-sf/f:*.dat{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-os/f:*.xvg{,.gz,.Z}/" "n/-oc/f:*.xvg{,.gz,.Z}/" "n/-oi/f:*.dat{,.gz,.Z}/" "n/-om/f:*.dat{,.gz,.Z}/" "n/-on/f:*.ndx{,.gz,.Z}/" "c/-/( f s sf n os oc oi om on h version nice b e dt xvg normpbc nopbc select selrpos seltype dump norm cfnorm resnr)/"
complete g_sgangle "n/-xvg/( xmgrace xmgr none)/" "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-oa/f:*.xvg{,.gz,.Z}/" "n/-od/f:*.xvg{,.gz,.Z}/" "n/-od1/f:*.xvg{,.gz,.Z}/" "n/-od2/f:*.xvg{,.gz,.Z}/" "c/-/( f n s oa od od1 od2 h version nice b e dt w xvg one z)/"
complete g_xrama "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "c/-/( f s h version nice b e dt)/"
complete make_edi "n/-xvg/( xmgrace xmgr none)/" "n/-f/f:*.{trr,cpt,trj}{,.gz,.Z}/" "n/-eig/f:*.xvg{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa,gro,g96,pdb,brk,ent}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-tar/f:*.{gro,g96,pdb,brk,ent,esp,xyz,tpr,tpb,tpa}{,.gz,.Z}/" "n/-ori/f:*.{gro,g96,pdb,brk,ent,esp,xyz,tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.edi{,.gz,.Z}/" "c/-/( f eig s n tar ori o h version nice xvg mon linfix linacc radfix radacc radcon flood outfrq slope linstep accdir radstep maxedsteps eqsteps deltaF0 deltaF tau Eflnull T alpha restrain hessian harmonic constF)/"
complete make_ndx "n/-f/f:*.{gro,g96,pdb,brk,ent,esp,xyz,tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-o/f:*.ndx{,.gz,.Z}/" "c/-/( f n o h version nice natoms)/"
-complete mdrun "n/-xvg/( xmgrace xmgr none)/" "n/-ddorder/( interleave pp_pme cartesian)/" "n/-dlb/( auto no yes)/" "n/-nb/( auto cpu gpu gpu_cpu)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.{trr,cpt,trj}{,.gz,.Z}/" "n/-x/f:*.xtc{,.gz,.Z}/" "n/-cpi/f:*.cpt{,.gz,.Z}/" "n/-cpo/f:*.cpt{,.gz,.Z}/" "n/-c/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-e/f:*.edr{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-dhdl/f:*.xvg{,.gz,.Z}/" "n/-field/f:*.xvg{,.gz,.Z}/" "n/-table/f:*.xvg{,.gz,.Z}/" "n/-tabletf/f:*.xvg{,.gz,.Z}/" "n/-tablep/f:*.xvg{,.gz,.Z}/" "n/-tableb/f:*.xvg{,.gz,.Z}/" "n/-rerun/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-tpi/f:*.xvg{,.gz,.Z}/" "n/-tpid/f:*.xvg{,.gz,.Z}/" "n/-ei/f:*.edi{,.gz,.Z}/" "n/-eo/f:*.xvg{,.gz,.Z}/" "n/-j/f:*.gct{,.gz,.Z}/" "n/-jo/f:*.gct{,.gz,.Z}/" "n/-ffout/f:*.xvg{,.gz,.Z}/" "n/-devout/f:*.xvg{,.gz,.Z}/" "n/-runav/f:*.xvg{,.gz,.Z}/" "n/-px/f:*.xvg{,.gz,.Z}/" "n/-pf/f:*.xvg{,.gz,.Z}/" "n/-ro/f:*.xvg{,.gz,.Z}/" "n/-ra/f:*.log{,.gz,.Z}/" "n/-rs/f:*.log{,.gz,.Z}/" "n/-rt/f:*.log{,.gz,.Z}/" "n/-mtx/f:*.mtx{,.gz,.Z}/" "n/-dn/f:*.ndx{,.gz,.Z}/" "n/-multidir/f:*.rundir{,.gz,.Z}/" "n/-membed/f:*.dat{,.gz,.Z}/" "n/-mp/f:*.top{,.gz,.Z}/" "n/-mn/f:*.ndx{,.gz,.Z}/" "c/-/( s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme nopin pinht pinoffset gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize)/"
-complete mdrun_mpi "n/-xvg/( xmgrace xmgr none)/" "n/-ddorder/( interleave pp_pme cartesian)/" "n/-dlb/( auto no yes)/" "n/-nb/( auto cpu gpu gpu_cpu)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.{trr,cpt,trj}{,.gz,.Z}/" "n/-x/f:*.xtc{,.gz,.Z}/" "n/-cpi/f:*.cpt{,.gz,.Z}/" "n/-cpo/f:*.cpt{,.gz,.Z}/" "n/-c/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-e/f:*.edr{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-dhdl/f:*.xvg{,.gz,.Z}/" "n/-field/f:*.xvg{,.gz,.Z}/" "n/-table/f:*.xvg{,.gz,.Z}/" "n/-tabletf/f:*.xvg{,.gz,.Z}/" "n/-tablep/f:*.xvg{,.gz,.Z}/" "n/-tableb/f:*.xvg{,.gz,.Z}/" "n/-rerun/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-tpi/f:*.xvg{,.gz,.Z}/" "n/-tpid/f:*.xvg{,.gz,.Z}/" "n/-ei/f:*.edi{,.gz,.Z}/" "n/-eo/f:*.edo{,.gz,.Z}/" "n/-j/f:*.gct{,.gz,.Z}/" "n/-jo/f:*.gct{,.gz,.Z}/" "n/-ffout/f:*.xvg{,.gz,.Z}/" "n/-devout/f:*.xvg{,.gz,.Z}/" "n/-runav/f:*.xvg{,.gz,.Z}/" "n/-px/f:*.xvg{,.gz,.Z}/" "n/-pf/f:*.xvg{,.gz,.Z}/" "n/-ro/f:*.xvg{,.gz,.Z}/" "n/-ra/f:*.log{,.gz,.Z}/" "n/-rs/f:*.log{,.gz,.Z}/" "n/-rt/f:*.log{,.gz,.Z}/" "n/-mtx/f:*.mtx{,.gz,.Z}/" "n/-dn/f:*.ndx{,.gz,.Z}/" "n/-multidir/f:*.rundir{,.gz,.Z}/" "n/-membed/f:*.dat{,.gz,.Z}/" "n/-mp/f:*.top{,.gz,.Z}/" "n/-mn/f:*.ndx{,.gz,.Z}/" "c/-/( s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme nopin pinht pinoffset gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize)/"
+complete mdrun "n/-xvg/( xmgrace xmgr none)/" "n/-ddorder/( interleave pp_pme cartesian)/" "n/-pin/( auto on off)/" "n/-dlb/( auto no yes)/" "n/-nb/( auto cpu gpu gpu_cpu)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.{trr,cpt,trj}{,.gz,.Z}/" "n/-x/f:*.xtc{,.gz,.Z}/" "n/-cpi/f:*.cpt{,.gz,.Z}/" "n/-cpo/f:*.cpt{,.gz,.Z}/" "n/-c/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-e/f:*.edr{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-dhdl/f:*.xvg{,.gz,.Z}/" "n/-field/f:*.xvg{,.gz,.Z}/" "n/-table/f:*.xvg{,.gz,.Z}/" "n/-tabletf/f:*.xvg{,.gz,.Z}/" "n/-tablep/f:*.xvg{,.gz,.Z}/" "n/-tableb/f:*.xvg{,.gz,.Z}/" "n/-rerun/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-tpi/f:*.xvg{,.gz,.Z}/" "n/-tpid/f:*.xvg{,.gz,.Z}/" "n/-ei/f:*.edi{,.gz,.Z}/" "n/-eo/f:*.xvg{,.gz,.Z}/" "n/-j/f:*.gct{,.gz,.Z}/" "n/-jo/f:*.gct{,.gz,.Z}/" "n/-ffout/f:*.xvg{,.gz,.Z}/" "n/-devout/f:*.xvg{,.gz,.Z}/" "n/-runav/f:*.xvg{,.gz,.Z}/" "n/-px/f:*.xvg{,.gz,.Z}/" "n/-pf/f:*.xvg{,.gz,.Z}/" "n/-ro/f:*.xvg{,.gz,.Z}/" "n/-ra/f:*.log{,.gz,.Z}/" "n/-rs/f:*.log{,.gz,.Z}/" "n/-rt/f:*.log{,.gz,.Z}/" "n/-mtx/f:*.mtx{,.gz,.Z}/" "n/-dn/f:*.ndx{,.gz,.Z}/" "n/-multidir/f:*.{,.gz,.Z}/" "n/-membed/f:*.dat{,.gz,.Z}/" "n/-mp/f:*.top{,.gz,.Z}/" "n/-mn/f:*.ndx{,.gz,.Z}/" "c/-/( s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme pin pinoffset pinstride gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize)/"
+complete mdrun_mpi "n/-xvg/( xmgrace xmgr none)/" "n/-ddorder/( interleave pp_pme cartesian)/" "n/-pin/( auto on off)/" "n/-dlb/( auto no yes)/" "n/-nb/( auto cpu gpu gpu_cpu)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.{trr,cpt,trj}{,.gz,.Z}/" "n/-x/f:*.xtc{,.gz,.Z}/" "n/-cpi/f:*.cpt{,.gz,.Z}/" "n/-cpo/f:*.cpt{,.gz,.Z}/" "n/-c/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-e/f:*.edr{,.gz,.Z}/" "n/-g/f:*.log{,.gz,.Z}/" "n/-dhdl/f:*.xvg{,.gz,.Z}/" "n/-field/f:*.xvg{,.gz,.Z}/" "n/-table/f:*.xvg{,.gz,.Z}/" "n/-tabletf/f:*.xvg{,.gz,.Z}/" "n/-tablep/f:*.xvg{,.gz,.Z}/" "n/-tableb/f:*.xvg{,.gz,.Z}/" "n/-rerun/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-tpi/f:*.xvg{,.gz,.Z}/" "n/-tpid/f:*.xvg{,.gz,.Z}/" "n/-ei/f:*.edi{,.gz,.Z}/" "n/-eo/f:*.xvg{,.gz,.Z}/" "n/-j/f:*.gct{,.gz,.Z}/" "n/-jo/f:*.gct{,.gz,.Z}/" "n/-ffout/f:*.xvg{,.gz,.Z}/" "n/-devout/f:*.xvg{,.gz,.Z}/" "n/-runav/f:*.xvg{,.gz,.Z}/" "n/-px/f:*.xvg{,.gz,.Z}/" "n/-pf/f:*.xvg{,.gz,.Z}/" "n/-ro/f:*.xvg{,.gz,.Z}/" "n/-ra/f:*.log{,.gz,.Z}/" "n/-rs/f:*.log{,.gz,.Z}/" "n/-rt/f:*.log{,.gz,.Z}/" "n/-mtx/f:*.mtx{,.gz,.Z}/" "n/-dn/f:*.ndx{,.gz,.Z}/" "n/-multidir/f:*.line_buf{,.gz,.Z}/" "n/-membed/f:*.dat{,.gz,.Z}/" "n/-mp/f:*.top{,.gz,.Z}/" "n/-mn/f:*.ndx{,.gz,.Z}/" "c/-/( s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme pin pinoffset pinstride gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize)/"
complete mk_angndx "n/-type/( angle dihedral improper ryckaert-bellemans)/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "c/-/( s n h version nice type nohyd hq)/"
complete ngmx "n/-f/f:*.{xtc,trr,cpt,trj,gro,g96,pdb,g87}{,.gz,.Z}/" "n/-s/f:*.{tpr,tpb,tpa}{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "c/-/( f s n h version nice b e dt)/"
complete pdb2gmx "n/-chainsep/( id_or_ter id_and_ter ter id interactive)/" "n/-merge/( no all interactive)/" "n/-water/( select none spc spce tip3p tip4p tip5p)/" "n/-vsite/( none hydrogens aromatics)/" "n/-f/f:*.{gro,g96,pdb,brk,ent,esp,xyz,tpr,tpb,tpa}{,.gz,.Z}/" "n/-o/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "n/-p/f:*.top{,.gz,.Z}/" "n/-i/f:*.itp{,.gz,.Z}/" "n/-n/f:*.ndx{,.gz,.Z}/" "n/-q/f:*.{gro,g96,pdb,brk,ent,esp,xyz}{,.gz,.Z}/" "c/-/( f o p i n q h version nice chainsep merge ff water inter ss ter lys arg asp glu gln his angle dist una ignh missing v posrefc vsite heavyh deuterate nochargegrp nocmap renum rtpres)/"
compctl -x 's[-]' -s " f s n ol oh h version nice b e dt w nf all nonojump fit" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-ol]' -g '*.(xtc|trr|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-oh]' -g '*.(xtc|trr|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' -- g_filter
compctl -x 's[-]' -s " f s n o acf h version nice b e dt w xvg nmol q p moi nz acflen nonormalize P fitfn ncskip beginfit endfit" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-P]' -s " 0 1 2 3" - 'c[-1,-fitfn]' -s " none exp aexp exp_exp vac exp5 exp7 exp9 erffit" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-acf]' -g '*.xvg(|.gz|.Z) *(/)' -- g_gyrate
compctl -x 's[-]' -s " f n nm s o h version nice b e dt w xvg d sl" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-nm]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xvg(|.gz|.Z) *(/)' -- g_h2order
-compctl -x 's[-]' -s " f s n num g ac dist ang hx hbn hbm don dan life nhbdist h version nice b e dt tu xvg a r noda r2 abin rbin nonitacc contact shell fitstart fitstart temp smooth dump max_hb nomerge geminate diff acflen nonormalize P fitfn ncskip beginfit endfit" - 'c[-1,-tu]' -s " fs ps ns us ms s" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-geminate]' -s " none dd ad aa a4" - 'c[-1,-P]' -s " 0 1 2 3" - 'c[-1,-fitfn]' -s " none exp aexp exp_exp vac exp5 exp7 exp9 erffit" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-num]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-ac]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-dist]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ang]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-hx]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-hbn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-hbm]' -g '*.xpm(|.gz|.Z) *(/)' - 'c[-1,-don]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-dan]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-life]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-nhbdist]' -g '*.xvg(|.gz|.Z) *(/)' -- g_hbond
+compctl -x 's[-]' -s " f s n num g ac dist ang hx hbn hbm don dan life nhbdist h version nice b e dt tu xvg a r noda r2 abin rbin nonitacc contact shell fitstart fitstart temp smooth dump max_hb nomerge geminate diff nthreads acflen nonormalize P fitfn ncskip beginfit endfit" - 'c[-1,-tu]' -s " fs ps ns us ms s" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-geminate]' -s " none dd ad aa a4" - 'c[-1,-P]' -s " 0 1 2 3" - 'c[-1,-fitfn]' -s " none exp aexp exp_exp vac exp5 exp7 exp9 erffit" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-num]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-ac]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-dist]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ang]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-hx]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-hbn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-hbm]' -g '*.xpm(|.gz|.Z) *(/)' - 'c[-1,-don]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-dan]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-life]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-nhbdist]' -g '*.xvg(|.gz|.Z) *(/)' -- g_hbond
compctl -x 's[-]' -s " s n f to cz co h version nice b e dt w r0 q noF db prop ev ahxstart ahxend" - 'c[-1,-prop]' -s " RAD TWIST RISE LEN NHX DIP RMS CPHI RMSA PHI PSI HB3 HB4 HB5 CD222" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-to]' -g '*.g87(|.gz|.Z) *(/)' - 'c[-1,-cz]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-co]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' -- g_helix
compctl -x 's[-]' -s " s f n oaxis ocenter orise oradius otwist obending otilt orot h version nice b e dt xvg sidechain incremental" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-oaxis]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-ocenter]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-orise]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-oradius]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-otwist]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-obending]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-otilt]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-orot]' -g '*.xvg(|.gz|.Z) *(/)' -- g_helixorient
compctl -x 's[-]' -s " f n s o or Spect h version nice b e dt w d bw sgang1 sgang2 tblock nlevel" - 'c[-1,-d]' -s " z x y" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xpm(|.gz|.Z) *(/)' - 'c[-1,-or]' -g '*.out(|.gz|.Z) *(/)' - 'c[-1,-Spect]' -g '*.out(|.gz|.Z) *(/)' -- g_hydorder
compctl -x 's[-]' -s " f s n o h version nice b e dt w xvg d noaver acflen nonormalize P fitfn ncskip beginfit endfit" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-P]' -s " 0 1 2 3" - 'c[-1,-fitfn]' -s " none exp aexp exp_exp vac exp5 exp7 exp9 erffit" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xvg(|.gz|.Z) *(/)' -- g_rotacf
compctl -x 's[-]' -s " f s n o h version nice b e dt w xvg ref skip fitxy nomw" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-ref]' -s " none xyz xy" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xvg(|.gz|.Z) *(/)' -- g_rotmat
compctl -x 's[-]' -s " f s h version nice b e dt t sep" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' -- g_saltbr
-compctl -x 's[-]' -s " s f n d pr sq prframe sqframe h version nice b e dt tu xvg mode mcover nopbc startq endq qstep seed" - 'c[-1,-tu]' -s " fs ps ns us ms s" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-mode]' -s " direct mc" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-d]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-pr]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-sq]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-prframe]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-sqframe]' -g '*.xvg(|.gz|.Z) *(/)' -- g_sans
+compctl -x 's[-]' -s " s f n d pr sq prframe sqframe h version nice b e dt tu xvg mode mcover nopbc startq endq qstep seed nt" - 'c[-1,-tu]' -s " fs ps ns us ms s" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-mode]' -s " direct mc" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-d]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-pr]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-sq]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-prframe]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-sqframe]' -g '*.xvg(|.gz|.Z) *(/)' -- g_sans
compctl -x 's[-]' -s " f s o or oa tv q n i h version nice b e dt w xvg probe ndots qmax f_index minarea nopbc noprot dgs" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-or]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-oa]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tv]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-q]' -g '*.pdb(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-i]' -g '*.itp(|.gz|.Z) *(/)' -- g_sas
compctl -x 's[-]' -s " f s sf n os oc oi om on h version nice b e dt xvg normpbc nopbc select selrpos seltype dump norm cfnorm resnr" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-selrpos]' -s " atom res_com res_cog mol_com mol_cog whole_res_com whole_res_cog whole_mol_com whole_mol_cog part_res_com part_res_cog part_mol_com part_mol_cog dyn_res_com dyn_res_cog dyn_mol_com dyn_mol_cog" - 'c[-1,-seltype]' -s " atom res_com res_cog mol_com mol_cog whole_res_com whole_res_cog whole_mol_com whole_mol_cog part_res_com part_res_cog part_mol_com part_mol_cog dyn_res_com dyn_res_cog dyn_mol_com dyn_mol_cog" - 'c[-1,-resnr]' -s " number index" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-sf]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-os]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-oc]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-oi]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-om]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-on]' -g '*.ndx(|.gz|.Z) *(/)' -- g_select
compctl -x 's[-]' -s " f n s oa od od1 od2 h version nice b e dt w xvg one z" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-oa]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-od]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-od1]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-od2]' -g '*.xvg(|.gz|.Z) *(/)' -- g_sgangle
compctl -x 's[-]' -s " f s h version nice b e dt" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' -- g_xrama
compctl -x 's[-]' -s " f eig s n tar ori o h version nice xvg mon linfix linacc radfix radacc radcon flood outfrq slope linstep accdir radstep maxedsteps eqsteps deltaF0 deltaF tau Eflnull T alpha restrain hessian harmonic constF" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-f]' -g '*.(trr|cpt|trj)(|.gz|.Z) *(/)' - 'c[-1,-eig]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa|gro|g96|pdb|brk|ent)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-tar]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz|tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-ori]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz|tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.edi(|.gz|.Z) *(/)' -- make_edi
compctl -x 's[-]' -s " f n o h version nice natoms" - 'c[-1,-f]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz|tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.ndx(|.gz|.Z) *(/)' -- make_ndx
-compctl -x 's[-]' -s " s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme nopin pinht pinoffset gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-ddorder]' -s " interleave pp_pme cartesian" - 'c[-1,-dlb]' -s " auto no yes" - 'c[-1,-nb]' -s " auto cpu gpu gpu_cpu" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.(trr|cpt|trj)(|.gz|.Z) *(/)' - 'c[-1,-x]' -g '*.xtc(|.gz|.Z) *(/)' - 'c[-1,-cpi]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-cpo]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-c]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-e]' -g '*.edr(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-dhdl]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-field]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-table]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tabletf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tablep]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tableb]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-rerun]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-tpi]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tpid]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ei]' -g '*.edi(|.gz|.Z) *(/)' - 'c[-1,-eo]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-j]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-jo]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-ffout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-devout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-runav]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-px]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-pf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ro]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ra]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rs]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rt]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-mtx]' -g '*.mtx(|.gz|.Z) *(/)' - 'c[-1,-dn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-multidir]' -g '*.rundir(|.gz|.Z) *(/)' - 'c[-1,-membed]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-mp]' -g '*.top(|.gz|.Z) *(/)' - 'c[-1,-mn]' -g '*.ndx(|.gz|.Z) *(/)' -- mdrun
-compctl -x 's[-]' -s " s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme nopin pinht pinoffset gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-ddorder]' -s " interleave pp_pme cartesian" - 'c[-1,-dlb]' -s " auto no yes" - 'c[-1,-nb]' -s " auto cpu gpu gpu_cpu" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.(trr|cpt|trj)(|.gz|.Z) *(/)' - 'c[-1,-x]' -g '*.xtc(|.gz|.Z) *(/)' - 'c[-1,-cpi]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-cpo]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-c]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-e]' -g '*.edr(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-dhdl]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-field]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-table]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tabletf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tablep]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tableb]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-rerun]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-tpi]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tpid]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ei]' -g '*.edi(|.gz|.Z) *(/)' - 'c[-1,-eo]' -g '*.edo(|.gz|.Z) *(/)' - 'c[-1,-j]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-jo]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-ffout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-devout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-runav]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-px]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-pf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ro]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ra]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rs]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rt]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-mtx]' -g '*.mtx(|.gz|.Z) *(/)' - 'c[-1,-dn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-multidir]' -g '*.rundir(|.gz|.Z) *(/)' - 'c[-1,-membed]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-mp]' -g '*.top(|.gz|.Z) *(/)' - 'c[-1,-mn]' -g '*.ndx(|.gz|.Z) *(/)' -- mdrun_mpi
+compctl -x 's[-]' -s " s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme pin pinoffset pinstride gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-ddorder]' -s " interleave pp_pme cartesian" - 'c[-1,-pin]' -s " auto on off" - 'c[-1,-dlb]' -s " auto no yes" - 'c[-1,-nb]' -s " auto cpu gpu gpu_cpu" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.(trr|cpt|trj)(|.gz|.Z) *(/)' - 'c[-1,-x]' -g '*.xtc(|.gz|.Z) *(/)' - 'c[-1,-cpi]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-cpo]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-c]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-e]' -g '*.edr(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-dhdl]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-field]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-table]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tabletf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tablep]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tableb]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-rerun]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-tpi]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tpid]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ei]' -g '*.edi(|.gz|.Z) *(/)' - 'c[-1,-eo]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-j]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-jo]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-ffout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-devout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-runav]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-px]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-pf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ro]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ra]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rs]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rt]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-mtx]' -g '*.mtx(|.gz|.Z) *(/)' - 'c[-1,-dn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-multidir]' -g '*.(|.gz|.Z) *(/)' - 'c[-1,-membed]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-mp]' -g '*.top(|.gz|.Z) *(/)' - 'c[-1,-mn]' -g '*.ndx(|.gz|.Z) *(/)' -- mdrun
+compctl -x 's[-]' -s " s o x cpi cpo c e g dhdl field table tabletf tablep tableb rerun tpi tpid ei eo j jo ffout devout runav px pf ro ra rs rt mtx dn multidir membed mp mn h version nice deffnm xvg pd dd ddorder npme nt ntmpi ntomp ntomp_pme pin pinoffset pinstride gpu_id noddcheck rdd rcon dlb dds gcom nb notunepme testverlet v nocompact seppot pforce reprod cpt cpnum noappend nsteps maxh multi replex nex reseed ionize" - 'c[-1,-xvg]' -s " xmgrace xmgr none" - 'c[-1,-ddorder]' -s " interleave pp_pme cartesian" - 'c[-1,-pin]' -s " auto on off" - 'c[-1,-dlb]' -s " auto no yes" - 'c[-1,-nb]' -s " auto cpu gpu gpu_cpu" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.(trr|cpt|trj)(|.gz|.Z) *(/)' - 'c[-1,-x]' -g '*.xtc(|.gz|.Z) *(/)' - 'c[-1,-cpi]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-cpo]' -g '*.cpt(|.gz|.Z) *(/)' - 'c[-1,-c]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-e]' -g '*.edr(|.gz|.Z) *(/)' - 'c[-1,-g]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-dhdl]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-field]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-table]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tabletf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tablep]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tableb]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-rerun]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-tpi]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-tpid]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ei]' -g '*.edi(|.gz|.Z) *(/)' - 'c[-1,-eo]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-j]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-jo]' -g '*.gct(|.gz|.Z) *(/)' - 'c[-1,-ffout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-devout]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-runav]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-px]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-pf]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ro]' -g '*.xvg(|.gz|.Z) *(/)' - 'c[-1,-ra]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rs]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-rt]' -g '*.log(|.gz|.Z) *(/)' - 'c[-1,-mtx]' -g '*.mtx(|.gz|.Z) *(/)' - 'c[-1,-dn]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-multidir]' -g '*.line_buf(|.gz|.Z) *(/)' - 'c[-1,-membed]' -g '*.dat(|.gz|.Z) *(/)' - 'c[-1,-mp]' -g '*.top(|.gz|.Z) *(/)' - 'c[-1,-mn]' -g '*.ndx(|.gz|.Z) *(/)' -- mdrun_mpi
compctl -x 's[-]' -s " s n h version nice type nohyd hq" - 'c[-1,-type]' -s " angle dihedral improper ryckaert-bellemans" - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' -- mk_angndx
compctl -x 's[-]' -s " f s n h version nice b e dt" - 'c[-1,-f]' -g '*.(xtc|trr|cpt|trj|gro|g96|pdb|g87)(|.gz|.Z) *(/)' - 'c[-1,-s]' -g '*.(tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' -- ngmx
compctl -x 's[-]' -s " f o p i n q h version nice chainsep merge ff water inter ss ter lys arg asp glu gln his angle dist una ignh missing v posrefc vsite heavyh deuterate nochargegrp nocmap renum rtpres" - 'c[-1,-chainsep]' -s " id_or_ter id_and_ter ter id interactive" - 'c[-1,-merge]' -s " no all interactive" - 'c[-1,-water]' -s " select none spc spce tip3p tip4p tip5p" - 'c[-1,-vsite]' -s " none hydrogens aromatics" - 'c[-1,-f]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz|tpr|tpb|tpa)(|.gz|.Z) *(/)' - 'c[-1,-o]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' - 'c[-1,-p]' -g '*.top(|.gz|.Z) *(/)' - 'c[-1,-i]' -g '*.itp(|.gz|.Z) *(/)' - 'c[-1,-n]' -g '*.ndx(|.gz|.Z) *(/)' - 'c[-1,-q]' -g '*.(gro|g96|pdb|brk|ent|esp|xyz)(|.gz|.Z) *(/)' -- pdb2gmx
# NONBONDED_SOURCES is imported from the nonbonded subdirectory.
add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${THREAD_MPI_SRC} ${NONBONDED_SOURCES})
-target_link_libraries(gmx ${GMX_GPU_LIBRARIES} ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${THREAD_LIB} ${OpenMP_SHARED_LINKER_FLAGS})
+target_link_libraries(gmx ${FFT_LIBRARIES} ${GMX_GPU_LIBRARIES} ${GMX_EXTRA_LIBRARIES} ${THREAD_LIB} ${OpenMP_SHARED_LINKER_FLAGS})
if(USE_VERSION_H)
add_dependencies(gmx gmx_version)
endif()
block_bc(cr, fep->delta_lambda);
block_bc(cr, fep->bPrintEnergy);
block_bc(cr, fep->n_lambda);
- snew_bc(cr, fep->all_lambda, efptNR);
- nblock_bc(cr, efptNR, fep->all_lambda);
- for (i = 0; i < efptNR; i++)
+ if (fep->n_lambda > 0)
{
- snew_bc(cr, fep->all_lambda[i], fep->n_lambda);
- nblock_bc(cr, fep->n_lambda, fep->all_lambda[i]);
+ snew_bc(cr, fep->all_lambda, efptNR);
+ nblock_bc(cr, efptNR, fep->all_lambda);
+ for (i = 0; i < efptNR; i++)
+ {
+ snew_bc(cr, fep->all_lambda[i], fep->n_lambda);
+ nblock_bc(cr, fep->n_lambda, fep->all_lambda[i]);
+ }
}
block_bc(cr, fep->sc_alpha);
block_bc(cr, fep->sc_power);
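The guard above avoids allocating and broadcasting the ragged all_lambda table when there are no lambda points at all. A minimal sketch of the same pattern in plain MPI (block_bc/snew_bc/nblock_bc are GROMACS broadcast macros; the function and names below are illustrative only):

#include <stdlib.h>
#include <mpi.h>

/* Broadcast a ragged ntypes x n table from rank 0, skipping empty tables. */
static void bc_lambda_table(MPI_Comm comm, int rank, int ntypes,
                            int *n, double ***table)
{
    int i;

    MPI_Bcast(n, 1, MPI_INT, 0, comm);      /* every rank learns the count */
    if (*n > 0)                             /* nothing to ship when empty  */
    {
        if (rank != 0)                      /* receivers allocate storage  */
        {
            *table = malloc(ntypes*sizeof(**table));
            for (i = 0; i < ntypes; i++)
            {
                (*table)[i] = malloc((*n)*sizeof(***table));
            }
        }
        for (i = 0; i < ntypes; i++)
        {
            MPI_Bcast((*table)[i], *n, MPI_DOUBLE, 0, comm);
        }
    }
}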
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_avx_128_fma_double[] =
+ kernellist_avx_128_fma_double[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_avx_128_fma_double_size = sizeof(kernellist_avx_128_fma_double)/sizeof(kernellist_avx_128_fma_double[0]);
+ kernellist_avx_128_fma_double_size = sizeof(kernellist_avx_128_fma_double)/sizeof(kernellist_avx_128_fma_double[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_avx_128_fma_single[] =
+ kernellist_avx_128_fma_single[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_avx_128_fma_single_size = sizeof(kernellist_avx_128_fma_single)/sizeof(kernellist_avx_128_fma_single[0]);
+ kernellist_avx_128_fma_single_size = sizeof(kernellist_avx_128_fma_single)/sizeof(kernellist_avx_128_fma_single[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_avx_256_double[] =
+ kernellist_avx_256_double[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_avx_256_double_size = sizeof(kernellist_avx_256_double)/sizeof(kernellist_avx_256_double[0]);
+ kernellist_avx_256_double_size = sizeof(kernellist_avx_256_double)/sizeof(kernellist_avx_256_double[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_avx_256_single[] =
+ kernellist_avx_256_single[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_avx_256_single_size = sizeof(kernellist_avx_256_single)/sizeof(kernellist_avx_256_single[0]);
+ kernellist_avx_256_single_size = sizeof(kernellist_avx_256_single)/sizeof(kernellist_avx_256_single[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_c[] =
+ kernellist_c[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_c_size = sizeof(kernellist_c)/sizeof(kernellist_c[0]);
+ kernellist_c_size = sizeof(kernellist_c)/sizeof(kernellist_c[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_sse2_double[] =
+ kernellist_sse2_double[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_sse2_double_size = sizeof(kernellist_sse2_double)/sizeof(kernellist_sse2_double[0]);
+ kernellist_sse2_double_size = sizeof(kernellist_sse2_double)/sizeof(kernellist_sse2_double[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_sse2_single[] =
+ kernellist_sse2_single[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
+ kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_sse4_1_double[] =
+ kernellist_sse4_1_double[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_sse4_1_double_size = sizeof(kernellist_sse4_1_double)/sizeof(kernellist_sse4_1_double[0]);
+ kernellist_sse4_1_double_size = sizeof(kernellist_sse4_1_double)/sizeof(kernellist_sse4_1_double[0]);
#endif
# Write out the list of settings and corresponding kernels to the declaration file
fpdecl.write( '\n\n' )
fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( ' kernellist_'+Arch+'[] =\n' )
fpdecl.write( '{\n' )
for decl in kerneldecl[0:-1]:
fpdecl.write( decl + ',\n' )
fpdecl.write( kerneldecl[-1] + '\n' )
fpdecl.write( '};\n\n' )
fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( ' kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
fpdecl.write( '#endif\n')
fpdecl.close()
nb_kernel_info_t
-kernellist_sse4_1_single[] =
+ kernellist_sse4_1_single[] =
{
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
{ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
};
int
-kernellist_sse4_1_single_size = sizeof(kernellist_sse4_1_single)/sizeof(kernellist_sse4_1_single[0]);
+ kernellist_sse4_1_single_size = sizeof(kernellist_sse4_1_single)/sizeof(kernellist_sse4_1_single[0]);
#endif
{
#ifdef HAVE_PTHREAD_SETAFFINITY
cpu_set_t set;
- int ret;
+ int ret;
/* run getaffinity to check whether we get back ENOSYS */
- ret=pthread_getaffinity_np(pthread_self(), sizeof(set), &set);
+ ret = pthread_getaffinity_np(pthread_self(), sizeof(set), &set);
if (ret == 0)
{
return TMPI_SETAFFINITY_SUPPORT_YES;
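A self-contained sketch of the probe above: pthread_getaffinity_np() is called on the current thread only to find out whether the kernel implements the affinity syscalls at all; a zero return means setting affinity can be expected to work too.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static int affinity_supported(void)
{
    cpu_set_t set;

    /* returns 0 on success and an error number (e.g. ENOSYS) otherwise */
    return pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0;
}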
add_library(fahcore ${MDRUN_SOURCES})
else(GMX_FAHCORE)
-list(APPEND GMX_EXTRA_LIBRARIES gmxpreprocess md ${OpenMP_LINKER_FLAGS})
-
set(GMX_KERNEL_PROGRAMS
grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck)
if (NOT GMX_NO_QUOTES)
if (NOT ${PROGRAM} STREQUAL "g_luck")
gmx_add_man_page(${PROGRAM})
endif()
- target_link_libraries(${PROGRAM} ${GMX_EXTRA_LIBRARIES})
+ target_link_libraries(${PROGRAM} gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
set_target_properties(${PROGRAM} PROPERTIES OUTPUT_NAME "${PROGRAM}${GMX_BINARY_SUFFIX}")
endforeach()
add_executable(mdrun ${MDRUN_SOURCES} main.c)
gmx_add_man_page(mdrun)
-target_link_libraries(mdrun ${GMX_EXTRA_LIBRARIES})
+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS})
set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}")
# Construct component groups for installation; note that a component may
add_library(md ${MDLIB_SOURCES})
-target_link_libraries(md ${GMX_GPU_LIBRARIES} gmx ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${OpenMP_SHARED_LINKER_FLAGS})
+target_link_libraries(md ${GMX_GPU_LIBRARIES} gmx)
if(GMX_BUILD_OWN_FFTW)
# This dependency has to be made here rather than the CMakeLists.txt that
# does the FFTW build, because of the order in which
cellsize_min = comm->cellsize_min[dim];
- if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
+ if (!comm->bVacDLBNoLimit)
{
- cellsize_min = max(cellsize_min,
- comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
+            /* The cut-off might have changed, e.g. by PME load balancing,
+ * from the value used to set comm->cellsize_min, so check it.
+ */
+ cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
+
+ if (comm->bPMELoadBalDLBLimits)
+ {
+ /* Check for the cut-off limit set by the PME load balancing */
+ cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
+ }
}
return cellsize_min;
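Worked example of the limits above: with comm->cutoff = 1.2 nm and np_dlb = 2 communication pulses along the dimension, dynamic load balancing may not shrink a cell below 1.2/2 = 0.6 nm; if PME load balancing has raised the maximum cut-off to 1.5 nm, the stricter limit 1.5/2 = 0.75 nm applies instead.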
* The initialized data sets are then transmitted to the
* other nodes in broadcast_ed_data */
- edi->bNeedDoEdsam = edi->vecs.mon.neig
- || edi->vecs.linfix.neig
- || edi->vecs.linacc.neig
- || edi->vecs.radfix.neig
- || edi->vecs.radacc.neig
- || edi->vecs.radcon.neig;
-
alook = gmx_mtop_atomlookup_init(mtop);
/* evaluate masses (reference structure) */
for (nr_edi = 1; nr_edi <= nED; nr_edi++)
{
+ /* Remember for each ED group whether we have to do essential dynamics
+ * constraints or possibly only flooding */
+ edi->bNeedDoEdsam = edi->vecs.mon.neig
+ || edi->vecs.linfix.neig
+ || edi->vecs.linacc.neig
+ || edi->vecs.radfix.neig
+ || edi->vecs.radacc.neig
+ || edi->vecs.radcon.neig;
+
fprintf(ed->edo, "#\n");
fprintf(ed->edo, "# Summary of applied con/restraints for the ED group %c\n", get_EDgroupChar(nr_edi, nED));
fprintf(ed->edo, "# Atoms in average structure: %d\n", edi->sav.nr);
edi = ed->edpar;
for (nr_edi = 1; nr_edi <= nED; nr_edi++)
{
- nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
-
- /* Essential dynamics, projections on eigenvectors */
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.mon, get_EDgroupChar(nr_edi, nED), "MON" );
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linfix, get_EDgroupChar(nr_edi, nED), "LINFIX");
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linacc, get_EDgroupChar(nr_edi, nED), "LINACC");
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radfix, get_EDgroupChar(nr_edi, nED), "RADFIX");
- if (edi->vecs.radfix.neig)
- {
- nice_legend(&setname, &nsets, &LegendStr, "RADFIX radius", "nm", get_EDgroupChar(nr_edi, nED));
- }
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radacc, get_EDgroupChar(nr_edi, nED), "RADACC");
- if (edi->vecs.radacc.neig)
- {
- nice_legend(&setname, &nsets, &LegendStr, "RADACC radius", "nm", get_EDgroupChar(nr_edi, nED));
- }
- nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radcon, get_EDgroupChar(nr_edi, nED), "RADCON");
- if (edi->vecs.radcon.neig)
+ if (edi->bNeedDoEdsam) /* Only print ED legend if at least one ED option is on */
{
- nice_legend(&setname, &nsets, &LegendStr, "RADCON radius", "nm", get_EDgroupChar(nr_edi, nED));
- }
+ nice_legend(&setname, &nsets, &LegendStr, "RMSD to ref", "nm", get_EDgroupChar(nr_edi, nED) );
+ /* Essential dynamics, projections on eigenvectors */
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.mon, get_EDgroupChar(nr_edi, nED), "MON" );
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linfix, get_EDgroupChar(nr_edi, nED), "LINFIX");
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.linacc, get_EDgroupChar(nr_edi, nED), "LINACC");
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radfix, get_EDgroupChar(nr_edi, nED), "RADFIX");
+ if (edi->vecs.radfix.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADFIX radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radacc, get_EDgroupChar(nr_edi, nED), "RADACC");
+ if (edi->vecs.radacc.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADACC radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+ nice_legend_evec(&setname, &nsets, &LegendStr, &edi->vecs.radcon, get_EDgroupChar(nr_edi, nED), "RADCON");
+ if (edi->vecs.radcon.neig)
+ {
+ nice_legend(&setname, &nsets, &LegendStr, "RADCON radius", "nm", get_EDgroupChar(nr_edi, nED));
+ }
+ }
edi = edi->next_edi;
} /* end of 'pure' essential dynamics legend entries */
n_edsam = nsets - n_flood;
j = ir->adress->tf_table_index[i]; /* get energy group index */
sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
*(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
- if(fp)
+ if (fp)
{
- fprintf(fp,"loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
+ fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
}
fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
}
}
static void print_cycles(FILE *fplog, double c2t, const char *name,
- int nnodes_tot, int nnodes, int nthreads,
+ int nthreads_tot,
+ int nnodes, int nthreads,
int n, double c, double tot)
{
char num[11];
sprintf(num, " ");
sprintf(thstr, " ");
}
- wallt = c*c2t*nnodes_tot/(double)nnodes;
+ /* Convert the cycle count to wallclock time for this task */
+ if (nthreads > 0)
+ {
+ /* Cycle count has been multiplied by the thread count,
+ * correct for the number of threads used.
+ */
+ wallt = c*c2t*nthreads_tot/(double)(nnodes*nthreads);
+ }
+ else
+ {
+ /* nthreads=-1 signals total run time, no correction required */
+ wallt = c*c2t;
+ }
fprintf(fplog, " %-19s %4d %4s %10s %10.3f %12.3f %5.1f\n",
name, nnodes, thstr, num, wallt, c*1e-9, 100*c/tot);
}
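Worked example of the thread correction (assuming c2t converts thread-summed cycle counts to seconds): in a run with nth_tot = 32, say 8 PP ranks with 4 threads each, a PP counter (nnodes = 8, nthreads = 4) gets wallt = c*c2t*32/(8*4) = c*c2t; on a 6 PP + 2 PME rank split with 4 threads everywhere, a PME counter (nnodes = 2, nthreads = 4) is scaled by 32/8 = 4, compensating for its cycles having been accumulated on only a quarter of all threads.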
{
double *cycles;
double c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
- int i, j, npp, nth_pp, nth_pme;
+ int i, j, npp, nth_pp, nth_pme, nth_tot;
char buf[STRLEN];
const char *hline = "-----------------------------------------------------------------------------";
if (npme > 0)
{
npp = nnodes - npme;
+
+ nth_tot = npp*nth_pp + npme*nth_pme;
}
else
{
npp = nnodes;
npme = nnodes;
+
+ nth_tot = npp*nth_pp;
}
+
tot = cycles[ewcRUN];
/* Conversion factor from cycles to seconds */
{
if (!is_pme_subcounter(i))
{
- print_cycles(fplog, c2t, wcn[i], nnodes,
+ print_cycles(fplog, c2t, wcn[i], nth_tot,
is_pme_counter(i) ? npme : npp,
is_pme_counter(i) ? nth_pme : nth_pp,
wc->wcc[i].n, cycles[i], tot);
buf[9] = ' ';
snprintf(buf+10, 9, "%-9s", wcn[j]);
buf[19] = '\0';
- print_cycles(fplog, c2t, buf, nnodes,
+ print_cycles(fplog, c2t, buf, nth_tot,
is_pme_counter(i) ? npme : npp,
is_pme_counter(i) ? nth_pme : nth_pp,
wc->wcc_all[i*ewcNR+j].n,
}
}
}
- print_cycles(fplog, c2t, "Rest", npp, npp, -1, 0, tot-sum, tot);
+ print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
fprintf(fplog, "%s\n", hline);
- print_cycles(fplog, c2t, "Total", nnodes, nnodes, -1, 0, tot, tot);
+ print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
fprintf(fplog, "%s\n", hline);
if (wc->wcc[ewcPMEMESH].n > 0)
{
if (is_pme_subcounter(i))
{
- print_cycles(fplog, c2t, wcn[i], nnodes,
+ print_cycles(fplog, c2t, wcn[i], nth_tot,
is_pme_counter(i) ? npme : npp,
is_pme_counter(i) ? nth_pme : nth_pp,
wc->wcc[i].n, cycles[i], tot);
fprintf(fplog, "%s\n", hline);
for (i = 0; i < ewcsNR; i++)
{
- print_cycles(fplog, c2t, wcsn[i], nnodes, npp, nth_pp,
+ print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
wc->wcsc[i].n, cycles[ewcNR+i], tot);
}
fprintf(fplog, "%s\n", hline);
* In the kernel we can subtract 1 to generate the subsequent mask.
*/
const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
- int simd_4xn_diag_size, j;
+ int simd_4xn_diag_size, real_excl, simd_excl_size, j, s;
simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
/* The next half of the SIMD width is for i + 1 */
nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
}
+
+ /* We always use 32-bit integer exclusion masks. When we use
+ * double precision, we fit two integers in a double SIMD register.
+ */
+ real_excl = sizeof(real)/sizeof(*nbat->simd_excl_mask);
+ /* Set bits for use with both 4xN and 2x(N+N) kernels */
+ simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width*real_excl;
+ snew_aligned(nbat->simd_excl_mask, simd_excl_size*real_excl, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_excl_size; j++)
+ {
+ /* Set the consecutive bits for masking pair exclusions.
+         * For double precision a single set bit would suffice,
+         * but setting it in both integers avoids endianness issues.
+ */
+ for (s = 0; s < real_excl; s++)
+ {
+ /* Set the consecutive bits for masking pair exclusions */
+ nbat->simd_excl_mask[j*real_excl + s] = (1U << j);
+ }
+ }
}
#endif
}
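For concreteness, with GMX_NBNXN_SIMD_BITWIDTH = 256: in single precision simd_width = 256/(4*8) = 8 and real_excl = 1, so simd_excl_size = 4*8*1 = 32 and entry j simply holds 1U << j; in double precision simd_width = 4 and real_excl = 2, and each bit value is written into two consecutive 32-bit integers so that either half of a 64-bit SIMD lane carries the mask, as the endianness comment notes.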
}
break;
- default:
- gmx_incons("Unsupported nbnxn_atomdata_t format");
+ default:
+ gmx_incons("Unsupported nbnxn_atomdata_t format");
}
}
DEBUG -g -D_DEBUG_=1)
#Because this is a static library linked into the (potential) shared library
#it should have the export of the shared library.
+ target_link_libraries(nbnxn_cuda cuda_tools)
SET_TARGET_PROPERTIES(nbnxn_cuda PROPERTIES DEFINE_SYMBOL "md_EXPORTS" )
endif()
void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
int flags, int aloc,
- float *e_lj, float *e_el, rvec *fshift)
+ real *e_lj, real *e_el, rvec *fshift)
{
+ /* NOTE: only implemented for single-precision at this time */
cudaError_t stat;
int i, adat_end, iloc = -1;
volatile unsigned int *poll_word;
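The float -> real change makes the interface precision-agnostic even though, per the NOTE, only single precision is implemented so far. For reference, a sketch of the usual GROMACS convention for real (selected by the GMX_DOUBLE macro):

#ifdef GMX_DOUBLE
typedef double real;
#else
typedef float  real;
#endif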
nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
#endif
- int cj_ind; /* The current cj_ind index for the current list */
- int cj4_init; /* The first unitialized cj4 block */
+ int cj_ind; /* The current cj_ind index for the current list */
+    int           cj4_init;       /* The first uninitialized cj4 block */
- float *d2; /* Bounding box distance work array */
+ float *d2; /* Bounding box distance work array */
- nbnxn_cj_t *cj; /* The j-cell list */
- int cj_nalloc; /* Allocation size of cj */
+ nbnxn_cj_t *cj; /* The j-cell list */
+ int cj_nalloc; /* Allocation size of cj */
- int ncj_noq; /* Nr. of cluster pairs without Coul for flop count */
- int ncj_hlj; /* Nr. of cluster pairs with 1/2 LJ for flop count */
+ int ncj_noq; /* Nr. of cluster pairs without Coul for flop count */
+ int ncj_hlj; /* Nr. of cluster pairs with 1/2 LJ for flop count */
int *sort; /* Sort index */
int sort_nalloc; /* Allocation size of sort */
nbnxn_sci_t *sci_sort; /* Second sci array, for sorting */
int sci_sort_nalloc; /* Allocation size of sci_sort */
- gmx_cache_protect_t cp1; /* Protect cache between threads */
+ gmx_cache_protect_t cp1; /* Protect cache between threads */
} nbnxn_list_work_t;
/* Function type for setting the i-atom coordinate working data */
#endif
/* Without exclusions and energies we only need to mask the cut-off;
- * this can be faster with blendv (only available with SSE4.1 and later).
+ * this can be faster with blendv.
*/
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
* With gcc this is slower, except for RF on Sandy Bridge.
* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
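A minimal SSE sketch of the two cut-off masking variants being weighed here (illustrative helper names, not the kernel's gmx_* macros): the cmp+and form builds an explicit 0/all-ones mask, while the sub+blendv form exploits that rc2-rsq has its sign bit set exactly outside the cut-off.

#include <smmintrin.h>    /* SSE4.1 for _mm_blendv_ps */

static inline __m128 mask_cutoff_cmp_and(__m128 rinv, __m128 rsq, __m128 rc2)
{
    /* zero rinv where rsq >= rc2, via an explicit comparison mask */
    return _mm_and_ps(rinv, _mm_cmplt_ps(rsq, rc2));
}

static inline __m128 mask_cutoff_sub_blendv(__m128 rinv, __m128 rsq, __m128 rc2)
{
    /* blendv picks the zero operand wherever rc2-rsq is negative */
    return _mm_blendv_ps(rinv, _mm_setzero_ps(), _mm_sub_ps(rc2, rsq));
}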
#ifdef CHECK_EXCLS
/* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr int_SSE0;
- gmx_mm_pr int_SSE2;
+ gmx_mm_pr int_S0;
+ gmx_mm_pr int_S2;
#endif
- gmx_mm_pr jxSSE, jySSE, jzSSE;
- gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
- gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
- gmx_mm_pr tx_SSE0, ty_SSE0, tz_SSE0;
- gmx_mm_pr tx_SSE2, ty_SSE2, tz_SSE2;
- gmx_mm_pr rsq_SSE0, rinv_SSE0, rinvsq_SSE0;
- gmx_mm_pr rsq_SSE2, rinv_SSE2, rinvsq_SSE2;
+ gmx_mm_pr jx_S, jy_S, jz_S;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
+ gmx_mm_pr tx_S0, ty_S0, tz_S0;
+ gmx_mm_pr tx_S2, ty_S2, tz_S2;
+ gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0;
+ gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
#ifndef CUTOFF_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE2;
+ gmx_mm_pr wco_S0;
+ gmx_mm_pr wco_S2;
#endif
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_SSE0;
+ gmx_mm_pr wco_vdw_S0;
#ifndef HALF_LJ
- gmx_mm_pr wco_vdw_SSE2;
+ gmx_mm_pr wco_vdw_S2;
#endif
#endif
#ifdef CALC_COULOMB
#ifdef CHECK_EXCLS
/* 1/r masked with the interaction mask */
- gmx_mm_pr rinv_ex_SSE0;
- gmx_mm_pr rinv_ex_SSE2;
+ gmx_mm_pr rinv_ex_S0;
+ gmx_mm_pr rinv_ex_S2;
#endif
- gmx_mm_pr jq_SSE;
- gmx_mm_pr qq_SSE0;
- gmx_mm_pr qq_SSE2;
+ gmx_mm_pr jq_S;
+ gmx_mm_pr qq_S0;
+ gmx_mm_pr qq_S2;
#ifdef CALC_COUL_TAB
/* The force (PME mesh force) we need to subtract from 1/r^2 */
- gmx_mm_pr fsub_SSE0;
- gmx_mm_pr fsub_SSE2;
+ gmx_mm_pr fsub_S0;
+ gmx_mm_pr fsub_S2;
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr brsq_SSE0, brsq_SSE2;
- gmx_mm_pr ewcorr_SSE0, ewcorr_SSE2;
+ gmx_mm_pr brsq_S0, brsq_S2;
+ gmx_mm_pr ewcorr_S0, ewcorr_S2;
#endif
/* frcoul = (1/r - fsub)*r */
- gmx_mm_pr frcoul_SSE0;
- gmx_mm_pr frcoul_SSE2;
+ gmx_mm_pr frcoul_S0;
+ gmx_mm_pr frcoul_S2;
#ifdef CALC_COUL_TAB
/* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
- gmx_mm_pr r_SSE0, rs_SSE0, rf_SSE0, frac_SSE0;
- gmx_mm_pr r_SSE2, rs_SSE2, rf_SSE2, frac_SSE2;
+ gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0;
+ gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2;
/* Table index: rs truncated to an int */
-#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
- gmx_epi32 ti_SSE0, ti_SSE2;
-#else
- __m128i ti_SSE0, ti_SSE2;
-#endif
+ gmx_epi32 ti_S0, ti_S2;
/* Linear force table values */
- gmx_mm_pr ctab0_SSE0, ctab1_SSE0;
- gmx_mm_pr ctab0_SSE2, ctab1_SSE2;
+ gmx_mm_pr ctab0_S0, ctab1_S0;
+ gmx_mm_pr ctab0_S2, ctab1_S2;
#ifdef CALC_ENERGIES
/* Quadratic energy table value */
- gmx_mm_pr ctabv_SSE0;
- gmx_mm_pr ctabv_SSE2;
+ gmx_mm_pr ctabv_S0;
+ gmx_mm_pr ctabv_S2;
#endif
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
/* The potential (PME mesh) we need to subtract from 1/r */
- gmx_mm_pr vc_sub_SSE0;
- gmx_mm_pr vc_sub_SSE2;
+ gmx_mm_pr vc_sub_S0;
+ gmx_mm_pr vc_sub_S2;
#endif
#ifdef CALC_ENERGIES
/* Electrostatic potential */
- gmx_mm_pr vcoul_SSE0;
- gmx_mm_pr vcoul_SSE2;
+ gmx_mm_pr vcoul_S0;
+ gmx_mm_pr vcoul_S2;
#endif
#endif
/* The force times 1/r */
- gmx_mm_pr fscal_SSE0;
- gmx_mm_pr fscal_SSE2;
+ gmx_mm_pr fscal_S0;
+ gmx_mm_pr fscal_S2;
#ifdef CALC_LJ
#ifdef LJ_COMB_LB
/* LJ sigma_j/2 and sqrt(epsilon_j) */
- gmx_mm_pr hsig_j_SSE, seps_j_SSE;
+ gmx_mm_pr hsig_j_S, seps_j_S;
/* LJ sigma_ij and epsilon_ij */
- gmx_mm_pr sig_SSE0, eps_SSE0;
+ gmx_mm_pr sig_S0, eps_S0;
#ifndef HALF_LJ
- gmx_mm_pr sig_SSE2, eps_SSE2;
+ gmx_mm_pr sig_S2, eps_S2;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sig2_SSE0, sig6_SSE0;
+ gmx_mm_pr sig2_S0, sig6_S0;
#ifndef HALF_LJ
- gmx_mm_pr sig2_SSE2, sig6_SSE2;
+ gmx_mm_pr sig2_S2, sig6_S2;
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
#ifdef LJ_COMB_GEOM
- gmx_mm_pr c6s_j_SSE, c12s_j_SSE;
+ gmx_mm_pr c6s_j_S, c12s_j_S;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
#ifndef FIX_LJ_C
/* LJ C6 and C12 parameters, used with geometric comb. rule */
- gmx_mm_pr c6_SSE0, c12_SSE0;
+ gmx_mm_pr c6_S0, c12_S0;
#ifndef HALF_LJ
- gmx_mm_pr c6_SSE2, c12_SSE2;
+ gmx_mm_pr c6_S2, c12_S2;
#endif
#endif
/* Intermediate variables for LJ calculation */
#ifndef LJ_COMB_LB
- gmx_mm_pr rinvsix_SSE0;
+ gmx_mm_pr rinvsix_S0;
#ifndef HALF_LJ
- gmx_mm_pr rinvsix_SSE2;
+ gmx_mm_pr rinvsix_S2;
#endif
#endif
#ifdef LJ_COMB_LB
- gmx_mm_pr sir_SSE0, sir2_SSE0, sir6_SSE0;
+ gmx_mm_pr sir_S0, sir2_S0, sir6_S0;
#ifndef HALF_LJ
- gmx_mm_pr sir_SSE2, sir2_SSE2, sir6_SSE2;
+ gmx_mm_pr sir_S2, sir2_S2, sir6_S2;
#endif
#endif
- gmx_mm_pr FrLJ6_SSE0, FrLJ12_SSE0;
+ gmx_mm_pr FrLJ6_S0, FrLJ12_S0;
#ifndef HALF_LJ
- gmx_mm_pr FrLJ6_SSE2, FrLJ12_SSE2;
+ gmx_mm_pr FrLJ6_S2, FrLJ12_S2;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr VLJ6_SSE0, VLJ12_SSE0, VLJ_SSE0;
+ gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0;
#ifndef HALF_LJ
- gmx_mm_pr VLJ6_SSE2, VLJ12_SSE2, VLJ_SSE2;
+ gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2;
#endif
#endif
#endif /* CALC_LJ */
+ gmx_mm_hpr fjx_S, fjy_S, fjz_S;
+
/* j-cluster index */
cj = l_cj[cjind].cj;
#ifdef CHECK_EXCLS
{
/* Load integer interaction mask */
- /* With AVX there are no integer operations, so cast to real */
- gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
- /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
- * With gcc we don't need the cast, but it's faster.
- */
-#define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x))
- int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr, mask0)), zero_SSE);
- int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr, mask2)), zero_SSE);
-#undef cast_cvt
+ gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+
+ int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
+ int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
}
#endif
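The new gmx_checkbitmask_pr wraps the AVX cast/convert/compare sequence that the removed lines spelled out. Per SIMD lane it reduces to the following scalar test (a sketch; the helper name is illustrative):

/* All-ones where the exclusion word has the lane's bit set, else zero. */
static unsigned int check_bitmask(unsigned int excl_word, unsigned int lane_bit)
{
    return (excl_word & lane_bit) ? 0xFFFFFFFFu : 0u;
}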
+
/* load j atom coordinates */
- jxSSE = gmx_loaddh_pr(x+ajx);
- jySSE = gmx_loaddh_pr(x+ajy);
- jzSSE = gmx_loaddh_pr(x+ajz);
+ gmx_loaddh_pr(jx_S, x+ajx);
+ gmx_loaddh_pr(jy_S, x+ajy);
+ gmx_loaddh_pr(jz_S, x+ajz);
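gmx_loaddh_pr now takes its destination as an argument instead of returning a value, so it can expand to a multi-statement macro on targets where the half-load-and-duplicate is not a single expression. A hedged sketch of one possible AVX-256 single-precision expansion (illustrative; the actual macro differs per target):

#include <immintrin.h>

/* Load 4 floats and duplicate them into both 128-bit halves of a
 * 256-bit register, as the 2x(N+N) kernel layout requires. */
#define loaddh_ps(dst, src) \
    { (dst) = _mm256_broadcast_ps((const __m128 *)(src)); }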
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(ix_SSE0, jxSSE);
- dy_SSE0 = gmx_sub_pr(iy_SSE0, jySSE);
- dz_SSE0 = gmx_sub_pr(iz_SSE0, jzSSE);
- dx_SSE2 = gmx_sub_pr(ix_SSE2, jxSSE);
- dy_SSE2 = gmx_sub_pr(iy_SSE2, jySSE);
- dz_SSE2 = gmx_sub_pr(iz_SSE2, jzSSE);
+ dx_S0 = gmx_sub_pr(ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(iz_S0, jz_S);
+ dx_S2 = gmx_sub_pr(ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(iz_S2, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
#ifndef CUTOFF_BLENDV
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
#endif
#ifdef CHECK_EXCLS
#if UNROLLJ == UNROLLI
if (cj == ci_sh)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag_SSE0);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag_SSE2);
+ wco_S0 = gmx_and_pr(wco_S0, diag_S0);
+ wco_S2 = gmx_and_pr(wco_S2, diag_S2);
}
#else
-#error "only UNROLLJ == UNROLLI currently supported in the joined kernels"
+#if UNROLLJ == 2*UNROLLI
+ if (cj*2 == ci_sh)
+ {
+ wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
+ wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
+ }
+ else if (cj*2 + 1 == ci_sh)
+ {
+ wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
+ wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
+ }
+#else
+#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
+#endif
#endif
#else /* EXCL_FORCES */
- /* Remove all excluded atom pairs from the list */
- wco_SSE0 = gmx_and_pr(wco_SSE0, int_SSE0);
- wco_SSE2 = gmx_and_pr(wco_SSE2, int_SSE2);
+ /* No exclusion forces: remove all excluded atom pairs from the list */
+ wco_S0 = gmx_and_pr(wco_S0, int_S0);
+ wco_S2 = gmx_and_pr(wco_S2, int_S2);
#endif
#endif
#ifdef COUNT_PAIRS
{
int i, j;
- real tmp[UNROLLJ];
- for (i = 0; i < UNROLLI; i++)
+ real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ tmp = gmx_simd_align_real(tmpa);
+ for (i = 0; i < UNROLLI; i+=2)
{
- gmx_storeu_pr(tmp, i == 0 ? wco_SSE0 : (i == 1 ? wco_SSE1 : (i == 2 ? wco_SSE2 : wco_SSE3)));
- for (j = 0; j < UNROLLJ; j++)
+ gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
+ for (j = 0; j < 2*UNROLLJ; j++)
{
if (!(tmp[j] == 0))
{
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_SSE0 = gmx_add_pr(rsq_SSE0, gmx_andnot_pr(int_SSE0, avoid_sing_SSE));
- rsq_SSE2 = gmx_add_pr(rsq_SSE2, gmx_andnot_pr(int_SSE2, avoid_sing_SSE));
+ rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
+ rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
#endif
/* Calculate 1/r */
- rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0);
- rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2);
+ rinv_S0 = gmx_invsqrt_pr(rsq_S0);
+ rinv_S2 = gmx_invsqrt_pr(rsq_S2);
#ifdef CALC_COULOMB
/* Load parameters for j atom */
- jq_SSE = gmx_loaddh_pr(q+aj);
- qq_SSE0 = gmx_mul_pr(iq_SSE0, jq_SSE);
- qq_SSE2 = gmx_mul_pr(iq_SSE2, jq_SSE);
+ gmx_loaddh_pr(jq_S, q+aj);
+ qq_S0 = gmx_mul_pr(iq_S0, jq_S);
+ qq_S2 = gmx_mul_pr(iq_S2, jq_S);
#endif
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_SSE0, c12_SSE0);
+ load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_S0, c12_S0);
#ifndef HALF_LJ
- load_lj_pair_params2(nbfp2, nbfp3, type, aj, c6_SSE2, c12_SSE2);
+ load_lj_pair_params2(nbfp2, nbfp3, type, aj, c6_S2, c12_S2);
#endif
#endif /* not defined any LJ rule */
#ifdef LJ_COMB_GEOM
- c6s_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
- c12s_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
- c6_SSE0 = gmx_mul_pr(c6s_SSE0, c6s_j_SSE );
+ gmx_loaddh_pr(c6s_j_S, ljc+aj2+0);
+ gmx_loaddh_pr(c12s_j_S, ljc+aj2+STRIDE);
+ c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
#ifndef HALF_LJ
- c6_SSE2 = gmx_mul_pr(c6s_SSE2, c6s_j_SSE );
+ c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
#endif
- c12_SSE0 = gmx_mul_pr(c12s_SSE0, c12s_j_SSE);
+ c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S);
#ifndef HALF_LJ
- c12_SSE2 = gmx_mul_pr(c12s_SSE2, c12s_j_SSE);
+ c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S);
#endif
#endif /* LJ_COMB_GEOM */
#ifdef LJ_COMB_LB
- hsig_j_SSE = gmx_loaddh_pr(ljc+aj2+0);
- seps_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE);
+ gmx_loaddh_pr(hsig_j_S, ljc+aj2+0);
+ gmx_loaddh_pr(seps_j_S, ljc+aj2+STRIDE);
- sig_SSE0 = gmx_add_pr(hsig_i_SSE0, hsig_j_SSE);
- eps_SSE0 = gmx_mul_pr(seps_i_SSE0, seps_j_SSE);
+ sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
#ifndef HALF_LJ
- sig_SSE2 = gmx_add_pr(hsig_i_SSE2, hsig_j_SSE);
- eps_SSE2 = gmx_mul_pr(seps_i_SSE2, seps_j_SSE);
+ sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S);
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
#ifndef CUTOFF_BLENDV
- rinv_SSE0 = gmx_and_pr(rinv_SSE0, wco_SSE0);
- rinv_SSE2 = gmx_and_pr(rinv_SSE2, wco_SSE2);
+ rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0);
+ rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2);
#else
/* We only need to mask for the cut-off: blendv is faster */
- rinv_SSE0 = gmx_blendv_pr(rinv_SSE0, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE0));
- rinv_SSE2 = gmx_blendv_pr(rinv_SSE2, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE2));
+ rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
#endif
- rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0, rinv_SSE0);
- rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2, rinv_SSE2);
+ rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0);
+ rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2);
#ifdef CALC_COULOMB
/* Note that here we calculate force*r, not the usual force/r.
#ifdef EXCL_FORCES
/* Only add 1/r for non-excluded atom pairs */
- rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0, int_SSE0);
- rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2, int_SSE2);
+ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
+ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
#else
/* No exclusion forces, we always need 1/r */
-#define rinv_ex_SSE0 rinv_SSE0
-#define rinv_ex_SSE2 rinv_SSE2
+#define rinv_ex_S0 rinv_S0
+#define rinv_ex_S2 rinv_S2
#endif
#ifdef CALC_COUL_RF
/* Electrostatic interactions */
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_mul_pr(rsq_SSE0, mrc_3_SSE)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_mul_pr(rsq_SSE2, mrc_3_SSE)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
#ifdef CALC_ENERGIES
- vcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_add_pr(gmx_mul_pr(rsq_SSE0, hrc_3_SSE), moh_rc_SSE)));
- vcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_add_pr(gmx_mul_pr(rsq_SSE2, hrc_3_SSE), moh_rc_SSE)));
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
#endif
#endif
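For reference, the reaction-field expressions these lines presumably encode: V(r) = q_i*q_j*(1/r + k_rf*r^2 - c_rf) and F(r)*r = q_i*q_j*(1/r - 2*k_rf*r^2), consistent with the constants above being hrc_3 = k_rf, moh_rc = -c_rf and mrc_3 = -2*k_rf.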
* as large distances can cause an overflow in gmx_pmecorrF/V.
*/
#ifndef CUTOFF_BLENDV
- brsq_SSE0 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE0, wco_SSE0));
- brsq_SSE2 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE2, wco_SSE2));
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
#else
/* Strangely, putting mul on a separate line is slower (icc 13) */
- brsq_SSE0 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE0, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE0)));
- brsq_SSE2 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE2, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE2)));
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
#endif
- ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0), beta_SSE);
- ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2), beta_SSE);
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_mul_pr(ewcorr_SSE0, brsq_SSE0)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_mul_pr(ewcorr_SSE2, brsq_SSE2)));
+ ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
#ifdef CALC_ENERGIES
- vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0), beta_SSE);
- vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2), beta_SSE);
+ vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
#endif
#endif /* CALC_COUL_EWALD */
#ifdef CALC_COUL_TAB
/* Electrostatic interactions */
- r_SSE0 = gmx_mul_pr(rsq_SSE0, rinv_SSE0);
- r_SSE2 = gmx_mul_pr(rsq_SSE2, rinv_SSE2);
+ r_S0 = gmx_mul_pr(rsq_S0, rinv_S0);
+ r_S2 = gmx_mul_pr(rsq_S2, rinv_S2);
/* Convert r to scaled table units */
- rs_SSE0 = gmx_mul_pr(r_SSE0, invtsp_SSE);
- rs_SSE2 = gmx_mul_pr(r_SSE2, invtsp_SSE);
+ rs_S0 = gmx_mul_pr(r_S0, invtsp_S);
+ rs_S2 = gmx_mul_pr(r_S2, invtsp_S);
/* Truncate scaled r to an int */
- ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0);
- ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2);
-#ifdef GMX_X86_SSE4_1
- /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
- rf_SSE0 = gmx_floor_pr(rs_SSE0);
- rf_SSE2 = gmx_floor_pr(rs_SSE2);
+ ti_S0 = gmx_cvttpr_epi32(rs_S0);
+ ti_S2 = gmx_cvttpr_epi32(rs_S2);
+#ifdef GMX_HAVE_SIMD_FLOOR
+ rf_S0 = gmx_floor_pr(rs_S0);
+ rf_S2 = gmx_floor_pr(rs_S2);
#else
- rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0);
- rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2);
+ rf_S0 = gmx_cvtepi32_pr(ti_S0);
+ rf_S2 = gmx_cvtepi32_pr(ti_S2);
#endif
- frac_SSE0 = gmx_sub_pr(rs_SSE0, rf_SSE0);
- frac_SSE2 = gmx_sub_pr(rs_SSE2, rf_SSE2);
+ frac_S0 = gmx_sub_pr(rs_S0, rf_S0);
+ frac_S2 = gmx_sub_pr(rs_S2, rf_S2);
/* Load and interpolate table forces and possibly energies.
* Force and energy can be combined in one table, stride 4: FDV0
* Currently single precision uses FDV0, double F and V.
*/
#ifndef CALC_ENERGIES
- load_table_f(tab_coul_F, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0);
- load_table_f(tab_coul_F, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2);
+ load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
+ load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
#else
#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0, ctabv_SSE0);
- load_table_f_v(tab_coul_F, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2, ctabv_SSE2);
+ load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
+ load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
#else
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0, ctabv_SSE0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2, ctabv_SSE2);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
#endif
#endif
- fsub_SSE0 = gmx_add_pr(ctab0_SSE0, gmx_mul_pr(frac_SSE0, ctab1_SSE0));
- fsub_SSE2 = gmx_add_pr(ctab0_SSE2, gmx_mul_pr(frac_SSE2, ctab1_SSE2));
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_sub_pr(rinv_ex_SSE0, gmx_mul_pr(fsub_SSE0, r_SSE0)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_sub_pr(rinv_ex_SSE2, gmx_mul_pr(fsub_SSE2, r_SSE2)));
+ fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
#ifdef CALC_ENERGIES
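+ /* This appears to be trapezoidal integration of the linearly
+ * interpolated force: V(r) = V_i - (frac/(2*tabq_scale))*(F_i + F(r)),
+ * with V_i = ctabv, F_i = ctab0, F(r) = fsub and
+ * mhalfsp = -0.5/tabq_scale.
+ */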
- vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE0), gmx_add_pr(ctab0_SSE0, fsub_SSE0)));
- vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE2), gmx_add_pr(ctab0_SSE2, fsub_SSE2)));
+ vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
#endif
#endif /* CALC_COUL_TAB */
#ifndef NO_SHIFT_EWALD
/* Add Ewald potential shift to vc_sub for convenience */
#ifdef CHECK_EXCLS
- vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0, gmx_and_pr(sh_ewald_SSE, int_SSE0));
- vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2, gmx_and_pr(sh_ewald_SSE, int_SSE2));
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
#else
- vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0, sh_ewald_SSE);
- vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2, sh_ewald_SSE);
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
#endif
#endif
- vcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_sub_pr(rinv_ex_SSE0, vc_sub_SSE0));
- vcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_sub_pr(rinv_ex_SSE2, vc_sub_SSE2));
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
#endif
#ifdef CALC_ENERGIES
/* Mask energy for cut-off and diagonal */
- vcoul_SSE0 = gmx_and_pr(vcoul_SSE0, wco_SSE0);
- vcoul_SSE2 = gmx_and_pr(vcoul_SSE2, wco_SSE2);
+ vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2);
#endif
#endif /* CALC_COULOMB */
/* Lennard-Jones interaction */
#ifdef VDW_CUTOFF_CHECK
- wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0, rcvdw2_SSE);
+ wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
#ifndef HALF_LJ
- wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2, rcvdw2_SSE);
+ wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
#endif
#else
/* Same cut-off for Coulomb and VdW, reuse the registers */
-#define wco_vdw_SSE0 wco_SSE0
-#define wco_vdw_SSE2 wco_SSE2
+#define wco_vdw_S0 wco_S0
+#define wco_vdw_S2 wco_S2
#endif
#ifndef LJ_COMB_LB
- rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0, gmx_mul_pr(rinvsq_SSE0, rinvsq_SSE0));
+ rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
#ifdef EXCL_FORCES
- rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0, int_SSE0);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
#endif
#ifndef HALF_LJ
- rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2, gmx_mul_pr(rinvsq_SSE2, rinvsq_SSE2));
+ rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
#ifdef EXCL_FORCES
- rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2, int_SSE2);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
- rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0, wco_vdw_SSE0);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
#ifndef HALF_LJ
- rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2, wco_vdw_SSE2);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
#endif
#endif
- FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0, rinvsix_SSE0);
+ FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0);
#ifndef HALF_LJ
- FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2, rinvsix_SSE2);
+ FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2);
#endif
- FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0, gmx_mul_pr(rinvsix_SSE0, rinvsix_SSE0));
+ FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
#ifndef HALF_LJ
- FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2, gmx_mul_pr(rinvsix_SSE2, rinvsix_SSE2));
+ FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
#endif
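+ /* FrLJ6 = c6*r^-6 and FrLJ12 = c12*r^-12 are force*r contributions;
+ * the 1/6 and 1/12 applied to the energies below suggest the stored
+ * c6/c12 already carry the LJ force factors 6 and 12.
+ */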
#endif /* not LJ_COMB_LB */
#ifdef LJ_COMB_LB
- sir_SSE0 = gmx_mul_pr(sig_SSE0, rinv_SSE0);
+ sir_S0 = gmx_mul_pr(sig_S0, rinv_S0);
#ifndef HALF_LJ
- sir_SSE2 = gmx_mul_pr(sig_SSE2, rinv_SSE2);
+ sir_S2 = gmx_mul_pr(sig_S2, rinv_S2);
#endif
- sir2_SSE0 = gmx_mul_pr(sir_SSE0, sir_SSE0);
+ sir2_S0 = gmx_mul_pr(sir_S0, sir_S0);
#ifndef HALF_LJ
- sir2_SSE2 = gmx_mul_pr(sir_SSE2, sir_SSE2);
+ sir2_S2 = gmx_mul_pr(sir_S2, sir_S2);
#endif
- sir6_SSE0 = gmx_mul_pr(sir2_SSE0, gmx_mul_pr(sir2_SSE0, sir2_SSE0));
+ sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
#ifdef EXCL_FORCES
- sir6_SSE0 = gmx_and_pr(sir6_SSE0, int_SSE0);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
#endif
#ifndef HALF_LJ
- sir6_SSE2 = gmx_mul_pr(sir2_SSE2, gmx_mul_pr(sir2_SSE2, sir2_SSE2));
+ sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
#ifdef EXCL_FORCES
- sir6_SSE2 = gmx_and_pr(sir6_SSE2, int_SSE2);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
- sir6_SSE0 = gmx_and_pr(sir6_SSE0, wco_vdw_SSE0);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
#ifndef HALF_LJ
- sir6_SSE2 = gmx_and_pr(sir6_SSE2, wco_vdw_SSE2);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
#endif
#endif
- FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0, sir6_SSE0);
+ FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0);
#ifndef HALF_LJ
- FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2, sir6_SSE2);
+ FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2);
#endif
- FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0, sir6_SSE0);
+ FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0);
#ifndef HALF_LJ
- FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2, sir6_SSE2);
+ FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2);
#endif
#if defined CALC_ENERGIES
/* We need C6 and C12 to calculate the LJ potential shift */
- sig2_SSE0 = gmx_mul_pr(sig_SSE0, sig_SSE0);
+ sig2_S0 = gmx_mul_pr(sig_S0, sig_S0);
#ifndef HALF_LJ
- sig2_SSE2 = gmx_mul_pr(sig_SSE2, sig_SSE2);
+ sig2_S2 = gmx_mul_pr(sig_S2, sig_S2);
#endif
- sig6_SSE0 = gmx_mul_pr(sig2_SSE0, gmx_mul_pr(sig2_SSE0, sig2_SSE0));
+ sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
#ifndef HALF_LJ
- sig6_SSE2 = gmx_mul_pr(sig2_SSE2, gmx_mul_pr(sig2_SSE2, sig2_SSE2));
+ sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
#endif
- c6_SSE0 = gmx_mul_pr(eps_SSE0, sig6_SSE0);
+ c6_S0 = gmx_mul_pr(eps_S0, sig6_S0);
#ifndef HALF_LJ
- c6_SSE2 = gmx_mul_pr(eps_SSE2, sig6_SSE2);
+ c6_S2 = gmx_mul_pr(eps_S2, sig6_S2);
#endif
- c12_SSE0 = gmx_mul_pr(c6_SSE0, sig6_SSE0);
+ c12_S0 = gmx_mul_pr(c6_S0, sig6_S0);
#ifndef HALF_LJ
- c12_SSE2 = gmx_mul_pr(c6_SSE2, sig6_SSE2);
+ c12_S2 = gmx_mul_pr(c6_S2, sig6_S2);
#endif
#endif
#endif /* LJ_COMB_LB */
#ifdef CALC_COULOMB
#ifndef ENERGY_GROUPS
- vctotSSE = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0, vcoul_SSE2));
+ vctot_S = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
#else
- add_ener_grp_halves(vcoul_SSE0, vctp[0], vctp[1], egp_jj);
- add_ener_grp_halves(vcoul_SSE2, vctp[2], vctp[3], egp_jj);
+ add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
+ add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
#endif
#endif
#ifdef CALC_LJ
/* Calculate the LJ energies */
- VLJ6_SSE0 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE0, gmx_mul_pr(c6_SSE0, sh_invrc6_SSE)));
+ VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
#ifndef HALF_LJ
- VLJ6_SSE2 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE2, gmx_mul_pr(c6_SSE2, sh_invrc6_SSE)));
+ VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
#endif
- VLJ12_SSE0 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE0, gmx_mul_pr(c12_SSE0, sh_invrc12_SSE)));
+ VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
#ifndef HALF_LJ
- VLJ12_SSE2 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE2, gmx_mul_pr(c12_SSE2, sh_invrc12_SSE)));
+ VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
#endif
- VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0, VLJ6_SSE0);
+ VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2, VLJ6_SSE2);
+ VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
#endif
/* The potential shift should be removed for pairs beyond cut-off */
- VLJ_SSE0 = gmx_and_pr(VLJ_SSE0, wco_vdw_SSE0);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_and_pr(VLJ_SSE2, wco_vdw_SSE2);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
#endif
#ifdef CHECK_EXCLS
/* The potential shift should be removed for excluded pairs */
- VLJ_SSE0 = gmx_and_pr(VLJ_SSE0, int_SSE0);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_and_pr(VLJ_SSE2, int_SSE2);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
#endif
#endif
#ifndef ENERGY_GROUPS
- VvdwtotSSE = gmx_add_pr(VvdwtotSSE,
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
#ifndef HALF_LJ
- gmx_add_pr(VLJ_SSE0, VLJ_SSE2)
+ gmx_add_pr(VLJ_S0, VLJ_S2)
#else
- VLJ_SSE0
+ VLJ_S0
#endif
- );
+ );
#else
- add_ener_grp_halves(VLJ_SSE0, vvdwtp[0], vvdwtp[1], egp_jj);
+ add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
#ifndef HALF_LJ
- add_ener_grp_halves(VLJ_SSE2, vvdwtp[2], vvdwtp[3], egp_jj);
+ add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
#endif
#endif
#endif /* CALC_LJ */
#endif /* CALC_ENERGIES */
#ifdef CALC_LJ
- fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE0,
+ gmx_add_pr(frcoul_S0,
#else
- (
+ (
#endif
- gmx_sub_pr(FrLJ12_SSE0, FrLJ6_SSE0)));
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
#else
- fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0, frcoul_SSE0);
+ fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
- fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE2,
+ gmx_add_pr(frcoul_S2,
#else
- (
+ (
#endif
- gmx_sub_pr(FrLJ12_SSE2, FrLJ6_SSE2)));
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
#else
/* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
- fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2, frcoul_SSE2);
+ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
#endif
/* Calculate temporary vectorial force */
- tx_SSE0 = gmx_mul_pr(fscal_SSE0, dx_SSE0);
- tx_SSE2 = gmx_mul_pr(fscal_SSE2, dx_SSE2);
- ty_SSE0 = gmx_mul_pr(fscal_SSE0, dy_SSE0);
- ty_SSE2 = gmx_mul_pr(fscal_SSE2, dy_SSE2);
- tz_SSE0 = gmx_mul_pr(fscal_SSE0, dz_SSE0);
- tz_SSE2 = gmx_mul_pr(fscal_SSE2, dz_SSE2);
+ tx_S0 = gmx_mul_pr(fscal_S0, dx_S0);
+ tx_S2 = gmx_mul_pr(fscal_S2, dx_S2);
+ ty_S0 = gmx_mul_pr(fscal_S0, dy_S0);
+ ty_S2 = gmx_mul_pr(fscal_S2, dy_S2);
+ tz_S0 = gmx_mul_pr(fscal_S0, dz_S0);
+ tz_S2 = gmx_mul_pr(fscal_S2, dz_S2);
/* Increment i atom force */
- fix_SSE0 = gmx_add_pr(fix_SSE0, tx_SSE0);
- fix_SSE2 = gmx_add_pr(fix_SSE2, tx_SSE2);
- fiy_SSE0 = gmx_add_pr(fiy_SSE0, ty_SSE0);
- fiy_SSE2 = gmx_add_pr(fiy_SSE2, ty_SSE2);
- fiz_SSE0 = gmx_add_pr(fiz_SSE0, tz_SSE0);
- fiz_SSE2 = gmx_add_pr(fiz_SSE2, tz_SSE2);
+ fix_S0 = gmx_add_pr(fix_S0, tx_S0);
+ fix_S2 = gmx_add_pr(fix_S2, tx_S2);
+ fiy_S0 = gmx_add_pr(fiy_S0, ty_S0);
+ fiy_S2 = gmx_add_pr(fiy_S2, ty_S2);
+ fiz_S0 = gmx_add_pr(fiz_S0, tz_S0);
+ fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
/* Decrement j atom force */
- gmx_store_hpr(f+ajx,
- gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0, tx_SSE2) ));
- gmx_store_hpr(f+ajy,
- gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0, ty_SSE2) ));
- gmx_store_hpr(f+ajz,
- gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0, tz_SSE2) ));
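+ /* gmx_load_hpr is a statement macro (a = ...), so the j-force
+ * read-modify-write needs explicit load, subtract and store steps.
+ */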
+ gmx_load_hpr(fjx_S, f+ajx);
+ gmx_load_hpr(fjy_S, f+ajy);
+ gmx_load_hpr(fjz_S, f+ajz);
+ gmx_store_hpr(f+ajx, gmx_sub_hpr(fjx_S, gmx_sum4_hpr(tx_S0, tx_S2)));
+ gmx_store_hpr(f+ajy, gmx_sub_hpr(fjy_S, gmx_sum4_hpr(ty_S0, ty_S2)));
+ gmx_store_hpr(f+ajz, gmx_sub_hpr(fjz_S, gmx_sum4_hpr(tz_S0, tz_S2)));
}
-#undef rinv_ex_SSE0
-#undef rinv_ex_SSE2
+#undef rinv_ex_S0
+#undef rinv_ex_S2
-#undef wco_vdw_SSE0
-#undef wco_vdw_SSE2
+#undef wco_vdw_S0
+#undef wco_vdw_S2
#undef CUTOFF_BLENDV
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM256_HERE should be set before including this file */
+
+/* Include the full width SIMD macros */
#include "gmx_simd_macros.h"
+
+/* Define a few macros for half-width SIMD */
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+
+/* Half-width SIMD real type */
+#define gmx_mm_hpr __m128
+
+/* Half-width SIMD operations */
+/* Load reals at half-width aligned pointer b into half-width SIMD register a */
+#define gmx_load_hpr(a, b) a = _mm_load_ps(b)
+/* Broadcast one real at pointer b into half-width SIMD register a */
+#define gmx_load1_hpr(a, b) a = _mm_load1_ps(b)
+/* Broadcast the real at b into the low half of a and the real at b+1 into the high half */
+#define gmx_load1p1_pr(a, b) a = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
+/* Load reals at half-width aligned pointer b into two halves of a */
+#define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
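+/* Illustrative layouts for float SIMD, with b a float pointer:
+ * gmx_load1p1_pr: a = { b[0],b[0],b[0],b[0], b[1],b[1],b[1],b[1] }
+ * gmx_loaddh_pr (assuming gmx_mm256_load4_ps duplicates its 4 loads):
+ * a = { b[0],b[1],b[2],b[3], b[0],b[1],b[2],b[3] }
+ */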
+/* Store half-width SIMD register b into half-width aligned memory a */
+#define gmx_store_hpr(a, b) _mm_store_ps(a, b)
+#define gmx_add_hpr _mm_add_ps
+#define gmx_sub_hpr _mm_sub_ps
+/* Sum the four half-width parts of two full-width SIMD registers into one half-width register */
+#define gmx_sum4_hpr gmx_mm256_sum4h_m128
+
+#else
+#error "Half-width SIMD macros are not yet defined"
+#endif
+
+
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
-#if defined GMX_MM256_HERE
-#define STRIDE 4
-#endif
+/* The stride of all the atom data arrays is equal to half the SIMD width */
+#define STRIDE (GMX_SIMD_WIDTH_HERE/2)
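+/* Example: AVX-256 single precision has GMX_SIMD_WIDTH_HERE = 8, so
+ * UNROLLJ = STRIDE = 4 and each full-width register holds two i-atoms
+ * paired against a 4-atom j-cluster: the 2x(4+4) kernel layout.
+ */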
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-/* single precision 2x(4+4) kernel */
+#if GMX_SIMD_WIDTH_HERE == 8
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
-#define TAB_FDV0
+#else
+#if GMX_SIMD_WIDTH_HERE == 16
+/* This is getting ridiculous; SIMD horizontal adds would help,
+ * but this is not performance critical (it is only used to reduce energies).
+ */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
#else
#error "unsupported kernel configuration"
#endif
#endif
+
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+/* For the AVX-256 single precision 2x(4+4) kernel
+ * we can do half-SIMD-width aligned FDV0 table loads.
+ */
+#define TAB_FDV0
+#endif
+
+
#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
real *vctp[UNROLLI];
#endif
- gmx_mm_pr shX_SSE;
- gmx_mm_pr shY_SSE;
- gmx_mm_pr shZ_SSE;
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
- gmx_mm_pr fix_SSE0, fiy_SSE0, fiz_SSE0;
- gmx_mm_pr fix_SSE2, fiy_SSE2, fiz_SSE2;
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
- __m128 fix_SSE, fiy_SSE, fiz_SSE;
+ __m128 fix_S, fiy_S, fiz_S;
#else
- __m256d fix_SSE, fiy_SSE, fiz_SSE;
+ __m256d fix_S, fiy_S, fiz_S;
#endif
#else
- __m128d fix0_SSE, fiy0_SSE, fiz0_SSE;
- __m128d fix2_SSE, fiy2_SSE, fiz2_SSE;
+ __m128d fix0_S, fiy0_S, fiz0_S;
+ __m128d fix2_S, fiy2_S, fiz2_S;
#endif
- /* AVX: use floating point masks, as there are no integer instructions */
- gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
- gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
-
- gmx_mm_pr diag_jmi_SSE;
+ gmx_mm_pr diag_jmi_S;
#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_SSE0, diag_SSE2;
+ gmx_mm_pr diag_S0, diag_S2;
#else
- gmx_mm_pr diag0_SSE0, diag0_SSE2;
- gmx_mm_pr diag1_SSE0, diag1_SSE2;
+ gmx_mm_pr diag0_S0, diag0_S2;
+ gmx_mm_pr diag1_S0, diag1_S2;
#endif
- gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+ gmx_mm_pr mask_S0, mask_S2;
+
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
- gmx_mm_pr one_SSE = gmx_set1_pr(1.0);
- gmx_mm_pr iq_SSE0 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE2 = gmx_setzero_pr();
- gmx_mm_pr mrc_3_SSE;
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
#ifdef CALC_ENERGIES
- gmx_mm_pr hrc_3_SSE, moh_rc_SSE;
+ gmx_mm_pr hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
/* Coulomb table variables */
- gmx_mm_pr invtsp_SSE;
+ gmx_mm_pr invtsp_S;
const real *tab_coul_F;
#ifndef TAB_FDV0
const real *tab_coul_V;
#endif
-#ifdef GMX_MM256_HERE
- int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
- int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
-#endif
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
#ifdef CALC_ENERGIES
- gmx_mm_pr mhalfsp_SSE;
+ gmx_mm_pr mhalfsp_S;
#endif
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr beta2_SSE, beta_SSE;
+ gmx_mm_pr beta2_S, beta_S;
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
- gmx_mm_pr sh_ewald_SSE;
+ gmx_mm_pr sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
const real *ljc;
- gmx_mm_pr hsig_i_SSE0, seps_i_SSE0;
- gmx_mm_pr hsig_i_SSE2, seps_i_SSE2;
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
#else
#ifdef FIX_LJ_C
- real pvdw_array[2*UNROLLI*UNROLLJ+3];
+ real pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
real *pvdw_c6, *pvdw_c12;
- gmx_mm_pr c6_SSE0, c12_SSE0;
- gmx_mm_pr c6_SSE2, c12_SSE2;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S2, c12_S2;
#endif
#ifdef LJ_COMB_GEOM
const real *ljc;
- gmx_mm_pr c6s_SSE0, c12s_SSE0;
- gmx_mm_pr c6s_SSE1, c12s_SSE1;
- gmx_mm_pr c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
- gmx_mm_pr c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
- gmx_mm_pr vctotSSE, VvdwtotSSE;
- gmx_mm_pr sixthSSE, twelvethSSE;
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
- gmx_mm_pr avoid_sing_SSE;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr rcvdw2_SSE;
+ gmx_mm_pr rcvdw2_S;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sh_invrc6_SSE, sh_invrc12_SSE;
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
/* cppcheck-suppress unassignedVariable */
- real tmpsum_array[15], *tmpsum;
+ real tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
#endif
#ifdef CALC_SHIFTFORCES
/* cppcheck-suppress unassignedVariable */
- real shf_array[15], *shf;
+ real shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
#endif
int ninner;
#endif
/* Load j-i for the first i */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
+ diag_jmi_S = gmx_load_pr(nbat->simd_2xnn_diag);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#else
#if 2*UNROLLI == UNROLLJ
- diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
- diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
+ diag0_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag0_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag1_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag_i_S = gmx_add_pr(diag_i_S, one_S);
+ diag1_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
#endif
#endif
+ /* Load the per-lane bit masks used for topology exclusion checking */
+ mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*2*UNROLLJ);
+ mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*2*UNROLLJ);
+
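+ /* simd_excl_mask presumably holds one bit per lane, 0x0001..0x0080
+ * for mask_S0 and 0x0100..0x8000 for mask_S2, so a j-cluster's
+ * exclusion bitfield can be tested lane by lane further down.
+ */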
#ifdef CALC_COUL_TAB
-#ifdef GMX_MM256_HERE
/* Generate aligned table index pointers */
- ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
-#endif
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti2 = gmx_simd_align_int(ti2_array);
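+ /* Assumed behavior of gmx_simd_align_int: round the pointer up to the
+ * next GMX_SIMD_WIDTH_HERE*sizeof(int) boundary, roughly
+ * (int *)(((size_t)p + w*sizeof(int) - 1) & ~((size_t)(w*sizeof(int) - 1))),
+ * which is why ti0_array/ti2_array hold 2*GMX_SIMD_WIDTH_HERE ints.
+ */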
- invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
- mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif
#ifdef TAB_FDV0
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
- beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
- beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
#endif
#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
- sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
q = nbat->q;
shiftvec = shift_vec[0];
x = nbat->x;
- avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
/* The kernel supports either rcoulomb = rvdw or rcoulomb >= rvdw */
- rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
- rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif
#ifdef CALC_ENERGIES
- sixthSSE = gmx_set1_pr(1.0/6.0);
- twelvethSSE = gmx_set1_pr(1.0/12.0);
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
- sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
- sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif
- mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
- hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
- moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#ifdef CALC_ENERGIES
- tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+ tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
- shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+ shf = gmx_simd_align_real(shf_array);
#endif
#ifdef FIX_LJ_C
- pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c6 = gmx_simd_align_real(pvdw_array);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
for (jp = 0; jp < UNROLLJ; jp++)
pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
}
- c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
- c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
- c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
- c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
- c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
- c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
- c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
- c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- shX_SSE = gmx_load1_pr(shiftvec+ish3);
- shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
- shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
#if UNROLLJ <= 4
sci = ci*STRIDE;
}
#endif
-#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x), gmx_load1_hpr(x+1), 1)
-
/* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
- ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix), shX_SSE);
- ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2), shX_SSE);
- iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy), shY_SSE);
- iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2), shY_SSE);
- iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz), shZ_SSE);
- iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2), shZ_SSE);
+ gmx_load1p1_pr(ix_S0, x+scix);
+ gmx_load1p1_pr(ix_S2, x+scix+2);
+ gmx_load1p1_pr(iy_S0, x+sciy);
+ gmx_load1p1_pr(iy_S2, x+sciy+2);
+ gmx_load1p1_pr(iz_S0, x+sciz);
+ gmx_load1p1_pr(iz_S2, x+sciz+2);
+ ix_S0 = gmx_add_pr(ix_S0, shX_S);
+ ix_S2 = gmx_add_pr(ix_S2, shX_S);
+ iy_S0 = gmx_add_pr(iy_S0, shY_S);
+ iy_S2 = gmx_add_pr(iy_S2, shY_S);
+ iz_S0 = gmx_add_pr(iz_S0, shZ_S);
+ iz_S2 = gmx_add_pr(iz_S2, shZ_S);
if (do_coul)
{
- gmx_mm_pr facel_SSE;
+ gmx_mm_pr facel_S;
- facel_SSE = gmx_set1_pr(facel);
+ facel_S = gmx_set1_pr(facel);
- iq_SSE0 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci));
- iq_SSE2 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci+2));
+ gmx_load1p1_pr(iq_S0, q+sci);
+ gmx_load1p1_pr(iq_S2, q+sci+2);
+ iq_S0 = gmx_mul_pr(facel_S, iq_S0);
+ iq_S2 = gmx_mul_pr(facel_S, iq_S2);
}
#ifdef LJ_COMB_LB
- hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
- hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
- seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
- seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+ gmx_load1p1_pr(hsig_i_S0, ljc+sci2+0);
+ gmx_load1p1_pr(hsig_i_S2, ljc+sci2+2);
+ gmx_load1p1_pr(seps_i_S0, ljc+sci2+STRIDE+0);
+ gmx_load1p1_pr(seps_i_S2, ljc+sci2+STRIDE+2);
#else
#ifdef LJ_COMB_GEOM
- c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
+ gmx_load1p1_pr(c6s_S0, ljc+sci2+0);
if (!half_LJ)
{
- c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
+ gmx_load1p1_pr(c6s_S2, ljc+sci2+2);
}
- c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
+ gmx_load1p1_pr(c12s_S0, ljc+sci2+STRIDE+0);
if (!half_LJ)
{
- c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
+ gmx_load1p1_pr(c12s_S2, ljc+sci2+STRIDE+2);
}
#else
nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
#endif
/* Zero the potential energy for this list */
- VvdwtotSSE = gmx_setzero_pr();
- vctotSSE = gmx_setzero_pr();
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
/* Clear i atom forces */
- fix_SSE0 = gmx_setzero_pr();
- fix_SSE2 = gmx_setzero_pr();
- fiy_SSE0 = gmx_setzero_pr();
- fiy_SSE2 = gmx_setzero_pr();
- fiz_SSE0 = gmx_setzero_pr();
- fiz_SSE2 = gmx_setzero_pr();
+ fix_S0 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
cjind = cjind0;
ninner += cjind1 - cjind0;
/* Add accumulated i-forces to the force array */
-#if UNROLLJ >= 4
-#ifndef GMX_DOUBLE
-#define gmx_load_ps4 _mm_load_ps
-#define gmx_store_ps4 _mm_store_ps
-#define gmx_add_ps4 _mm_add_ps
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+#define gmx_load_pr4 _mm_load_ps
+#define gmx_store_pr4 _mm_store_ps
+#define gmx_add_pr4 _mm_add_ps
#else
-#define gmx_load_ps4 _mm256_load_pd
-#define gmx_store_ps4 _mm256_store_pd
-#define gmx_add_ps4 _mm256_add_pd
+#error "You need to define 4-width SIM macros for i-force reduction"
#endif
- GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0, fix_SSE2, fix_SSE);
- gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+ GMX_MM_TRANSPOSE_SUM4H_PR(fix_S0, fix_S2, fix_S);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0, fiy_SSE2, fiy_SSE);
- gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiy_S0, fiy_S2, fiy_S);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0, fiz_SSE2, fiz_SSE);
- gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM4H_PR(fiz_S0, fiz_S2, fiz_S);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
- gmx_store_ps4(shf, fix_SSE);
+ gmx_store_pr4(shf, fix_S);
fshift[ish3+0] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiy_SSE);
+ gmx_store_pr4(shf, fiy_S);
fshift[ish3+1] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiz_SSE);
+ gmx_store_pr4(shf, fiz_S);
fshift[ish3+2] += SUM_SIMD4(shf);
#endif
-#else
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
-
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
-
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
-
-#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
- fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
- fshift[ish3+1] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
- fshift[ish3+2] += shf[0] + shf[1];
-#endif
-#endif
#ifdef CALC_ENERGIES
if (do_coul)
{
- gmx_store_pr(tmpsum, vctotSSE);
+ gmx_store_pr(tmpsum, vctot_S);
*Vc += SUM_SIMD(tmpsum);
}
- gmx_store_pr(tmpsum, VvdwtotSSE);
+ gmx_store_pr(tmpsum, Vvdwtot_S);
*Vvdw += SUM_SIMD(tmpsum);
#endif
#endif
}
-#undef gmx_load2_hpr
-#undef gmx_load_ps4
-#undef gmx_store_ps4
-#undef gmx_store_ps4
+#undef gmx_load_pr4
+#undef gmx_store_pr4
+#undef gmx_add_pr4
#undef CALC_SHIFTFORCES
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE
+
+#undef gmx_mm_hpr
+
+#undef gmx_load_hpr
+#undef gmx_load1_hpr
+#undef gmx_load1p1_pr
+#undef gmx_loaddh_pr
+#undef gmx_store_hpr
+#undef gmx_add_hpr
+#undef gmx_sub_hpr
+
+#undef gmx_sum4_hpr
/* Include all flavors of the SSE or AVX 4xN kernel loops */
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define GMX_MM128_HERE
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define GMX_MM256_HERE
-#else
+#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
-#endif
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
#endif
/* Without exclusions and energies we only need to mask the cut-off,
- * this can be faster with blendv (only available with SSE4.1 and later).
+ * this can be faster when we have defined gmx_blendv_pr, i.e. an instruction
+ * that selects from two SIMD registers based on the contents of a third.
*/
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
* With gcc this is slower, except for RF on Sandy Bridge.
* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
*/
#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
-#define CUTOFF_BLENDV
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
* This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
* Tested with icc 13.
*/
#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
-#define CUTOFF_BLENDV
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
#endif
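+/* With NBNXN_CUTOFF_USE_BLENDV the two-step cut-off masking
+ *   wco = cmplt(rsq, rc2); rinv = blendzero(rinv, wco)
+ * collapses to rinv = blendv(rinv, zero, rc2 - rsq), which selects
+ * zero wherever rc2 - rsq is negative, i.e. beyond the cut-off.
+ */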
#ifdef CHECK_EXCLS
/* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr int_SSE0;
- gmx_mm_pr int_SSE1;
- gmx_mm_pr int_SSE2;
- gmx_mm_pr int_SSE3;
-#endif
-
- gmx_mm_pr jxSSE, jySSE, jzSSE;
- gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
- gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1;
- gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
- gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3;
- gmx_mm_pr tx_SSE0, ty_SSE0, tz_SSE0;
- gmx_mm_pr tx_SSE1, ty_SSE1, tz_SSE1;
- gmx_mm_pr tx_SSE2, ty_SSE2, tz_SSE2;
- gmx_mm_pr tx_SSE3, ty_SSE3, tz_SSE3;
- gmx_mm_pr rsq_SSE0, rinv_SSE0, rinvsq_SSE0;
- gmx_mm_pr rsq_SSE1, rinv_SSE1, rinvsq_SSE1;
- gmx_mm_pr rsq_SSE2, rinv_SSE2, rinvsq_SSE2;
- gmx_mm_pr rsq_SSE3, rinv_SSE3, rinvsq_SSE3;
-#ifndef CUTOFF_BLENDV
+ gmx_mm_pr int_S0;
+ gmx_mm_pr int_S1;
+ gmx_mm_pr int_S2;
+ gmx_mm_pr int_S3;
+#endif
+
+ gmx_mm_pr jx_S, jy_S, jz_S;
+ gmx_mm_pr dx_S0, dy_S0, dz_S0;
+ gmx_mm_pr dx_S1, dy_S1, dz_S1;
+ gmx_mm_pr dx_S2, dy_S2, dz_S2;
+ gmx_mm_pr dx_S3, dy_S3, dz_S3;
+ gmx_mm_pr tx_S0, ty_S0, tz_S0;
+ gmx_mm_pr tx_S1, ty_S1, tz_S1;
+ gmx_mm_pr tx_S2, ty_S2, tz_S2;
+ gmx_mm_pr tx_S3, ty_S3, tz_S3;
+ gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0;
+ gmx_mm_pr rsq_S1, rinv_S1, rinvsq_S1;
+ gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
+ gmx_mm_pr rsq_S3, rinv_S3, rinvsq_S3;
+#ifndef NBNXN_CUTOFF_USE_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE1;
- gmx_mm_pr wco_SSE2;
- gmx_mm_pr wco_SSE3;
+ gmx_mm_pr wco_S0;
+ gmx_mm_pr wco_S1;
+ gmx_mm_pr wco_S2;
+ gmx_mm_pr wco_S3;
#endif
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_SSE0;
- gmx_mm_pr wco_vdw_SSE1;
+ gmx_mm_pr wco_vdw_S0;
+ gmx_mm_pr wco_vdw_S1;
#ifndef HALF_LJ
- gmx_mm_pr wco_vdw_SSE2;
- gmx_mm_pr wco_vdw_SSE3;
+ gmx_mm_pr wco_vdw_S2;
+ gmx_mm_pr wco_vdw_S3;
#endif
#endif
#ifdef CALC_COULOMB
#ifdef CHECK_EXCLS
/* 1/r masked with the interaction mask */
- gmx_mm_pr rinv_ex_SSE0;
- gmx_mm_pr rinv_ex_SSE1;
- gmx_mm_pr rinv_ex_SSE2;
- gmx_mm_pr rinv_ex_SSE3;
-#endif
- gmx_mm_pr jq_SSE;
- gmx_mm_pr qq_SSE0;
- gmx_mm_pr qq_SSE1;
- gmx_mm_pr qq_SSE2;
- gmx_mm_pr qq_SSE3;
+ gmx_mm_pr rinv_ex_S0;
+ gmx_mm_pr rinv_ex_S1;
+ gmx_mm_pr rinv_ex_S2;
+ gmx_mm_pr rinv_ex_S3;
+#endif
+ gmx_mm_pr jq_S;
+ gmx_mm_pr qq_S0;
+ gmx_mm_pr qq_S1;
+ gmx_mm_pr qq_S2;
+ gmx_mm_pr qq_S3;
#ifdef CALC_COUL_TAB
/* The force (PME mesh force) we need to subtract from 1/r^2 */
- gmx_mm_pr fsub_SSE0;
- gmx_mm_pr fsub_SSE1;
- gmx_mm_pr fsub_SSE2;
- gmx_mm_pr fsub_SSE3;
+ gmx_mm_pr fsub_S0;
+ gmx_mm_pr fsub_S1;
+ gmx_mm_pr fsub_S2;
+ gmx_mm_pr fsub_S3;
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr brsq_SSE0, brsq_SSE1, brsq_SSE2, brsq_SSE3;
- gmx_mm_pr ewcorr_SSE0, ewcorr_SSE1, ewcorr_SSE2, ewcorr_SSE3;
+ gmx_mm_pr brsq_S0, brsq_S1, brsq_S2, brsq_S3;
+ gmx_mm_pr ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
#endif
/* frcoul = (1/r - fsub)*r */
- gmx_mm_pr frcoul_SSE0;
- gmx_mm_pr frcoul_SSE1;
- gmx_mm_pr frcoul_SSE2;
- gmx_mm_pr frcoul_SSE3;
+ gmx_mm_pr frcoul_S0;
+ gmx_mm_pr frcoul_S1;
+ gmx_mm_pr frcoul_S2;
+ gmx_mm_pr frcoul_S3;
#ifdef CALC_COUL_TAB
/* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
- gmx_mm_pr r_SSE0, rs_SSE0, rf_SSE0, frac_SSE0;
- gmx_mm_pr r_SSE1, rs_SSE1, rf_SSE1, frac_SSE1;
- gmx_mm_pr r_SSE2, rs_SSE2, rf_SSE2, frac_SSE2;
- gmx_mm_pr r_SSE3, rs_SSE3, rf_SSE3, frac_SSE3;
+ gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0;
+ gmx_mm_pr r_S1, rs_S1, rf_S1, frac_S1;
+ gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2;
+ gmx_mm_pr r_S3, rs_S3, rf_S3, frac_S3;
/* Table index: rs truncated to an int */
-#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
- gmx_epi32 ti_SSE0, ti_SSE1, ti_SSE2, ti_SSE3;
-#else
- __m128i ti_SSE0, ti_SSE1, ti_SSE2, ti_SSE3;
-#endif
+ gmx_epi32 ti_S0, ti_S1, ti_S2, ti_S3;
/* Linear force table values */
- gmx_mm_pr ctab0_SSE0, ctab1_SSE0;
- gmx_mm_pr ctab0_SSE1, ctab1_SSE1;
- gmx_mm_pr ctab0_SSE2, ctab1_SSE2;
- gmx_mm_pr ctab0_SSE3, ctab1_SSE3;
+ gmx_mm_pr ctab0_S0, ctab1_S0;
+ gmx_mm_pr ctab0_S1, ctab1_S1;
+ gmx_mm_pr ctab0_S2, ctab1_S2;
+ gmx_mm_pr ctab0_S3, ctab1_S3;
#ifdef CALC_ENERGIES
/* Quadratic energy table value */
- gmx_mm_pr ctabv_SSE0;
- gmx_mm_pr ctabv_SSE1;
- gmx_mm_pr ctabv_SSE2;
- gmx_mm_pr ctabv_SSE3;
+ gmx_mm_pr ctabv_S0;
+ gmx_mm_pr ctabv_S1;
+ gmx_mm_pr ctabv_S2;
+ gmx_mm_pr ctabv_S3;
#endif
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
/* The potential (PME mesh) we need to subtract from 1/r */
- gmx_mm_pr vc_sub_SSE0;
- gmx_mm_pr vc_sub_SSE1;
- gmx_mm_pr vc_sub_SSE2;
- gmx_mm_pr vc_sub_SSE3;
+ gmx_mm_pr vc_sub_S0;
+ gmx_mm_pr vc_sub_S1;
+ gmx_mm_pr vc_sub_S2;
+ gmx_mm_pr vc_sub_S3;
#endif
#ifdef CALC_ENERGIES
/* Electrostatic potential */
- gmx_mm_pr vcoul_SSE0;
- gmx_mm_pr vcoul_SSE1;
- gmx_mm_pr vcoul_SSE2;
- gmx_mm_pr vcoul_SSE3;
+ gmx_mm_pr vcoul_S0;
+ gmx_mm_pr vcoul_S1;
+ gmx_mm_pr vcoul_S2;
+ gmx_mm_pr vcoul_S3;
#endif
#endif
/* The force times 1/r */
- gmx_mm_pr fscal_SSE0;
- gmx_mm_pr fscal_SSE1;
- gmx_mm_pr fscal_SSE2;
- gmx_mm_pr fscal_SSE3;
+ gmx_mm_pr fscal_S0;
+ gmx_mm_pr fscal_S1;
+ gmx_mm_pr fscal_S2;
+ gmx_mm_pr fscal_S3;
#ifdef CALC_LJ
#ifdef LJ_COMB_LB
/* LJ sigma_j/2 and sqrt(epsilon_j) */
- gmx_mm_pr hsig_j_SSE, seps_j_SSE;
+ gmx_mm_pr hsig_j_S, seps_j_S;
/* LJ sigma_ij and epsilon_ij */
- gmx_mm_pr sig_SSE0, eps_SSE0;
- gmx_mm_pr sig_SSE1, eps_SSE1;
+ gmx_mm_pr sig_S0, eps_S0;
+ gmx_mm_pr sig_S1, eps_S1;
#ifndef HALF_LJ
- gmx_mm_pr sig_SSE2, eps_SSE2;
- gmx_mm_pr sig_SSE3, eps_SSE3;
+ gmx_mm_pr sig_S2, eps_S2;
+ gmx_mm_pr sig_S3, eps_S3;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sig2_SSE0, sig6_SSE0;
- gmx_mm_pr sig2_SSE1, sig6_SSE1;
+ gmx_mm_pr sig2_S0, sig6_S0;
+ gmx_mm_pr sig2_S1, sig6_S1;
#ifndef HALF_LJ
- gmx_mm_pr sig2_SSE2, sig6_SSE2;
- gmx_mm_pr sig2_SSE3, sig6_SSE3;
+ gmx_mm_pr sig2_S2, sig6_S2;
+ gmx_mm_pr sig2_S3, sig6_S3;
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
#ifdef LJ_COMB_GEOM
- gmx_mm_pr c6s_j_SSE, c12s_j_SSE;
+ gmx_mm_pr c6s_j_S, c12s_j_S;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
#ifndef FIX_LJ_C
/* LJ C6 and C12 parameters, used with geometric comb. rule */
- gmx_mm_pr c6_SSE0, c12_SSE0;
- gmx_mm_pr c6_SSE1, c12_SSE1;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
#ifndef HALF_LJ
- gmx_mm_pr c6_SSE2, c12_SSE2;
- gmx_mm_pr c6_SSE3, c12_SSE3;
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
#endif
#endif
/* Intermediate variables for LJ calculation */
#ifndef LJ_COMB_LB
- gmx_mm_pr rinvsix_SSE0;
- gmx_mm_pr rinvsix_SSE1;
+ gmx_mm_pr rinvsix_S0;
+ gmx_mm_pr rinvsix_S1;
#ifndef HALF_LJ
- gmx_mm_pr rinvsix_SSE2;
- gmx_mm_pr rinvsix_SSE3;
+ gmx_mm_pr rinvsix_S2;
+ gmx_mm_pr rinvsix_S3;
#endif
#endif
#ifdef LJ_COMB_LB
- gmx_mm_pr sir_SSE0, sir2_SSE0, sir6_SSE0;
- gmx_mm_pr sir_SSE1, sir2_SSE1, sir6_SSE1;
+ gmx_mm_pr sir_S0, sir2_S0, sir6_S0;
+ gmx_mm_pr sir_S1, sir2_S1, sir6_S1;
#ifndef HALF_LJ
- gmx_mm_pr sir_SSE2, sir2_SSE2, sir6_SSE2;
- gmx_mm_pr sir_SSE3, sir2_SSE3, sir6_SSE3;
+ gmx_mm_pr sir_S2, sir2_S2, sir6_S2;
+ gmx_mm_pr sir_S3, sir2_S3, sir6_S3;
#endif
#endif
- gmx_mm_pr FrLJ6_SSE0, FrLJ12_SSE0;
- gmx_mm_pr FrLJ6_SSE1, FrLJ12_SSE1;
+ gmx_mm_pr FrLJ6_S0, FrLJ12_S0;
+ gmx_mm_pr FrLJ6_S1, FrLJ12_S1;
#ifndef HALF_LJ
- gmx_mm_pr FrLJ6_SSE2, FrLJ12_SSE2;
- gmx_mm_pr FrLJ6_SSE3, FrLJ12_SSE3;
+ gmx_mm_pr FrLJ6_S2, FrLJ12_S2;
+ gmx_mm_pr FrLJ6_S3, FrLJ12_S3;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr VLJ6_SSE0, VLJ12_SSE0, VLJ_SSE0;
- gmx_mm_pr VLJ6_SSE1, VLJ12_SSE1, VLJ_SSE1;
+ gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0;
+ gmx_mm_pr VLJ6_S1, VLJ12_S1, VLJ_S1;
#ifndef HALF_LJ
- gmx_mm_pr VLJ6_SSE2, VLJ12_SSE2, VLJ_SSE2;
- gmx_mm_pr VLJ6_SSE3, VLJ12_SSE3, VLJ_SSE3;
+ gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2;
+ gmx_mm_pr VLJ6_S3, VLJ12_S3, VLJ_S3;
#endif
#endif
#endif /* CALC_LJ */
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
+#ifdef gmx_checkbitmask_epi32
{
- /* Load integer interaction mask */
- __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
+ /* Integer mask set and operations, cast result to real */
+ gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
- int_SSE0 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int, mask0), zeroi_SSE));
- int_SSE1 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int, mask1), zeroi_SSE));
- int_SSE2 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int, mask2), zeroi_SSE));
- int_SSE3 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int, mask3), zeroi_SSE));
+ int_S0 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S0));
+ int_S1 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S1));
+ int_S2 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S2));
+ int_S3 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S3));
}
-#endif
-#if defined GMX_X86_SSE2 && defined GMX_MM256_HERE
- {
-#ifndef GMX_DOUBLE
- /* Load integer interaction mask */
- /* With AVX there are no integer operations, so cast to real */
- gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
- /* We can't compare all 4*8=32 float bits: shift the mask */
- gmx_mm_pr masksh_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl>>(2*UNROLLJ)));
- /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
- * With gcc we don't need the cast, but it's faster.
- */
-#define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x))
- int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr, mask0)), zero_SSE);
- int_SSE1 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr, mask1)), zero_SSE);
- int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr, mask0)), zero_SSE);
- int_SSE3 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr, mask1)), zero_SSE);
-#undef cast_cvt
#else
- /* Load integer interaction mask */
- /* With AVX there are no integer operations,
- * and there is no int to double conversion, so cast to float
- */
- __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl));
-#define cast_cvt(x) _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x)))
- int_SSE0 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps, mask0)), zero_SSE);
- int_SSE1 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps, mask1)), zero_SSE);
- int_SSE2 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps, mask2)), zero_SSE);
- int_SSE3 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps, mask3)), zero_SSE);
-#undef cast_cvt
-#endif
+ {
+ /* Integer mask set, cast to real and real mask operations */
+ gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
+
+ int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
+ int_S1 = gmx_checkbitmask_pr(mask_pr_S, mask_S1);
+ int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
+ int_S3 = gmx_checkbitmask_pr(mask_pr_S, mask_S3);
}
#endif
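+ /* Both branches yield the same interaction masks: a lane is all ones
+ * when the j-cluster exclusion bitfield has that lane's mask bit set,
+ * all zeros otherwise; they differ only in whether integer SIMD
+ * operations are available.
+ */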
#endif
+
/* load j atom coordinates */
- jxSSE = gmx_load_pr(x+ajx);
- jySSE = gmx_load_pr(x+ajy);
- jzSSE = gmx_load_pr(x+ajz);
+ jx_S = gmx_load_pr(x+ajx);
+ jy_S = gmx_load_pr(x+ajy);
+ jz_S = gmx_load_pr(x+ajz);
/* Calculate distance */
- dx_SSE0 = gmx_sub_pr(ix_SSE0, jxSSE);
- dy_SSE0 = gmx_sub_pr(iy_SSE0, jySSE);
- dz_SSE0 = gmx_sub_pr(iz_SSE0, jzSSE);
- dx_SSE1 = gmx_sub_pr(ix_SSE1, jxSSE);
- dy_SSE1 = gmx_sub_pr(iy_SSE1, jySSE);
- dz_SSE1 = gmx_sub_pr(iz_SSE1, jzSSE);
- dx_SSE2 = gmx_sub_pr(ix_SSE2, jxSSE);
- dy_SSE2 = gmx_sub_pr(iy_SSE2, jySSE);
- dz_SSE2 = gmx_sub_pr(iz_SSE2, jzSSE);
- dx_SSE3 = gmx_sub_pr(ix_SSE3, jxSSE);
- dy_SSE3 = gmx_sub_pr(iy_SSE3, jySSE);
- dz_SSE3 = gmx_sub_pr(iz_SSE3, jzSSE);
+ dx_S0 = gmx_sub_pr(ix_S0, jx_S);
+ dy_S0 = gmx_sub_pr(iy_S0, jy_S);
+ dz_S0 = gmx_sub_pr(iz_S0, jz_S);
+ dx_S1 = gmx_sub_pr(ix_S1, jx_S);
+ dy_S1 = gmx_sub_pr(iy_S1, jy_S);
+ dz_S1 = gmx_sub_pr(iz_S1, jz_S);
+ dx_S2 = gmx_sub_pr(ix_S2, jx_S);
+ dy_S2 = gmx_sub_pr(iy_S2, jy_S);
+ dz_S2 = gmx_sub_pr(iz_S2, jz_S);
+ dx_S3 = gmx_sub_pr(ix_S3, jx_S);
+ dy_S3 = gmx_sub_pr(iy_S3, jy_S);
+ dz_S3 = gmx_sub_pr(iz_S3, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
+ rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
+ rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
+ rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
-#ifndef CUTOFF_BLENDV
- wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
- wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
- wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
- wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
+ wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S);
+ wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+ wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S);
#endif
#ifdef CHECK_EXCLS
#if UNROLLJ == UNROLLI
if (cj == ci_sh)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, diag_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, diag_SSE3);
+ wco_S0 = gmx_and_pr(wco_S0, diag_S0);
+ wco_S1 = gmx_and_pr(wco_S1, diag_S1);
+ wco_S2 = gmx_and_pr(wco_S2, diag_S2);
+ wco_S3 = gmx_and_pr(wco_S3, diag_S3);
}
#else
#if UNROLLJ < UNROLLI
if (cj == ci_sh*2)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag0_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, diag0_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag0_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, diag0_SSE3);
+ wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
+ wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
+ wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
+ wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
}
if (cj == ci_sh*2 + 1)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag1_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, diag1_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag1_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, diag1_SSE3);
+ wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
+ wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
+ wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
+ wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
}
#else
if (cj*2 == ci_sh)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag0_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, diag0_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag0_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, diag0_SSE3);
+ wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
+ wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
+ wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
+ wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
}
else if (cj*2 + 1 == ci_sh)
{
- wco_SSE0 = gmx_and_pr(wco_SSE0, diag1_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, diag1_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, diag1_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, diag1_SSE3);
+ wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
+ wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
+ wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
+ wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
}
#endif
#endif
#else /* EXCL_FORCES */
- /* Remove all excluded atom pairs from the list */
- wco_SSE0 = gmx_and_pr(wco_SSE0, int_SSE0);
- wco_SSE1 = gmx_and_pr(wco_SSE1, int_SSE1);
- wco_SSE2 = gmx_and_pr(wco_SSE2, int_SSE2);
- wco_SSE3 = gmx_and_pr(wco_SSE3, int_SSE3);
+ /* No exclusion forces: remove all excluded atom pairs from the list */
+ wco_S0 = gmx_and_pr(wco_S0, int_S0);
+ wco_S1 = gmx_and_pr(wco_S1, int_S1);
+ wco_S2 = gmx_and_pr(wco_S2, int_S2);
+ wco_S3 = gmx_and_pr(wco_S3, int_S3);
#endif
#endif
#ifdef COUNT_PAIRS
{
int i, j;
- real tmp[UNROLLJ];
+ real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
+ tmp = gmx_simd_align_real(tmpa);
for (i = 0; i < UNROLLI; i++)
{
- gmx_storeu_pr(tmp, i == 0 ? wco_SSE0 : (i == 1 ? wco_SSE1 : (i == 2 ? wco_SSE2 : wco_SSE3)));
+ gmx_store_pr(tmp, i == 0 ? wco_S0 : (i == 1 ? wco_S1 : (i == 2 ? wco_S2 : wco_S3)));
for (j = 0; j < UNROLLJ; j++)
{
if (!(tmp[j] == 0))
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_SSE0 = gmx_add_pr(rsq_SSE0, gmx_andnot_pr(int_SSE0, avoid_sing_SSE));
- rsq_SSE1 = gmx_add_pr(rsq_SSE1, gmx_andnot_pr(int_SSE1, avoid_sing_SSE));
- rsq_SSE2 = gmx_add_pr(rsq_SSE2, gmx_andnot_pr(int_SSE2, avoid_sing_SSE));
- rsq_SSE3 = gmx_add_pr(rsq_SSE3, gmx_andnot_pr(int_SSE3, avoid_sing_SSE));
+ rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
+ rsq_S1 = gmx_add_pr(rsq_S1, gmx_andnot_pr(int_S1, avoid_sing_S));
+ rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
+ rsq_S3 = gmx_add_pr(rsq_S3, gmx_andnot_pr(int_S3, avoid_sing_S));
#endif
/* Calculate 1/r */
#ifndef GMX_DOUBLE
- rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0);
- rinv_SSE1 = gmx_invsqrt_pr(rsq_SSE1);
- rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2);
- rinv_SSE3 = gmx_invsqrt_pr(rsq_SSE3);
+ rinv_S0 = gmx_invsqrt_pr(rsq_S0);
+ rinv_S1 = gmx_invsqrt_pr(rsq_S1);
+ rinv_S2 = gmx_invsqrt_pr(rsq_S2);
+ rinv_S3 = gmx_invsqrt_pr(rsq_S3);
#else
- GMX_MM_INVSQRT2_PD(rsq_SSE0, rsq_SSE1, rinv_SSE0, rinv_SSE1);
- GMX_MM_INVSQRT2_PD(rsq_SSE2, rsq_SSE3, rinv_SSE2, rinv_SSE3);
+ GMX_MM_INVSQRT2_PD(rsq_S0, rsq_S1, rinv_S0, rinv_S1);
+ GMX_MM_INVSQRT2_PD(rsq_S2, rsq_S3, rinv_S2, rinv_S3);
#endif
#ifdef CALC_COULOMB
/* Load parameters for j atom */
- jq_SSE = gmx_load_pr(q+aj);
- qq_SSE0 = gmx_mul_pr(iq_SSE0, jq_SSE);
- qq_SSE1 = gmx_mul_pr(iq_SSE1, jq_SSE);
- qq_SSE2 = gmx_mul_pr(iq_SSE2, jq_SSE);
- qq_SSE3 = gmx_mul_pr(iq_SSE3, jq_SSE);
+ jq_S = gmx_load_pr(q+aj);
+ qq_S0 = gmx_mul_pr(iq_S0, jq_S);
+ qq_S1 = gmx_mul_pr(iq_S1, jq_S);
+ qq_S2 = gmx_mul_pr(iq_S2, jq_S);
+ qq_S3 = gmx_mul_pr(iq_S3, jq_S);
#endif
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params(nbfp0, type, aj, c6_SSE0, c12_SSE0);
- load_lj_pair_params(nbfp1, type, aj, c6_SSE1, c12_SSE1);
+ load_lj_pair_params(nbfp0, type, aj, c6_S0, c12_S0);
+ load_lj_pair_params(nbfp1, type, aj, c6_S1, c12_S1);
#ifndef HALF_LJ
- load_lj_pair_params(nbfp2, type, aj, c6_SSE2, c12_SSE2);
- load_lj_pair_params(nbfp3, type, aj, c6_SSE3, c12_SSE3);
+ load_lj_pair_params(nbfp2, type, aj, c6_S2, c12_S2);
+ load_lj_pair_params(nbfp3, type, aj, c6_S3, c12_S3);
#endif
#endif /* not defined any LJ rule */
#ifdef LJ_COMB_GEOM
- c6s_j_SSE = gmx_load_pr(ljc+aj2+0);
- c12s_j_SSE = gmx_load_pr(ljc+aj2+STRIDE);
- c6_SSE0 = gmx_mul_pr(c6s_SSE0, c6s_j_SSE );
- c6_SSE1 = gmx_mul_pr(c6s_SSE1, c6s_j_SSE );
+ c6s_j_S = gmx_load_pr(ljc+aj2+0);
+ c12s_j_S = gmx_load_pr(ljc+aj2+STRIDE);
+ c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
+ c6_S1 = gmx_mul_pr(c6s_S1, c6s_j_S );
#ifndef HALF_LJ
- c6_SSE2 = gmx_mul_pr(c6s_SSE2, c6s_j_SSE );
- c6_SSE3 = gmx_mul_pr(c6s_SSE3, c6s_j_SSE );
+ c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
+ c6_S3 = gmx_mul_pr(c6s_S3, c6s_j_S );
#endif
- c12_SSE0 = gmx_mul_pr(c12s_SSE0, c12s_j_SSE);
- c12_SSE1 = gmx_mul_pr(c12s_SSE1, c12s_j_SSE);
+ c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S);
+ c12_S1 = gmx_mul_pr(c12s_S1, c12s_j_S);
#ifndef HALF_LJ
- c12_SSE2 = gmx_mul_pr(c12s_SSE2, c12s_j_SSE);
- c12_SSE3 = gmx_mul_pr(c12s_SSE3, c12s_j_SSE);
+ c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S);
+ c12_S3 = gmx_mul_pr(c12s_S3, c12s_j_S);
#endif
#endif /* LJ_COMB_GEOM */
#ifdef LJ_COMB_LB
- hsig_j_SSE = gmx_load_pr(ljc+aj2+0);
- seps_j_SSE = gmx_load_pr(ljc+aj2+STRIDE);
+ hsig_j_S = gmx_load_pr(ljc+aj2+0);
+ seps_j_S = gmx_load_pr(ljc+aj2+STRIDE);
- sig_SSE0 = gmx_add_pr(hsig_i_SSE0, hsig_j_SSE);
- sig_SSE1 = gmx_add_pr(hsig_i_SSE1, hsig_j_SSE);
- eps_SSE0 = gmx_mul_pr(seps_i_SSE0, seps_j_SSE);
- eps_SSE1 = gmx_mul_pr(seps_i_SSE1, seps_j_SSE);
+ sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
+ sig_S1 = gmx_add_pr(hsig_i_S1, hsig_j_S);
+ eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
+ eps_S1 = gmx_mul_pr(seps_i_S1, seps_j_S);
#ifndef HALF_LJ
- sig_SSE2 = gmx_add_pr(hsig_i_SSE2, hsig_j_SSE);
- sig_SSE3 = gmx_add_pr(hsig_i_SSE3, hsig_j_SSE);
- eps_SSE2 = gmx_mul_pr(seps_i_SSE2, seps_j_SSE);
- eps_SSE3 = gmx_mul_pr(seps_i_SSE3, seps_j_SSE);
+ sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S);
+ sig_S3 = gmx_add_pr(hsig_i_S3, hsig_j_S);
+ eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S);
+ eps_S3 = gmx_mul_pr(seps_i_S3, seps_j_S);
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
-#ifndef CUTOFF_BLENDV
- rinv_SSE0 = gmx_and_pr(rinv_SSE0, wco_SSE0);
- rinv_SSE1 = gmx_and_pr(rinv_SSE1, wco_SSE1);
- rinv_SSE2 = gmx_and_pr(rinv_SSE2, wco_SSE2);
- rinv_SSE3 = gmx_and_pr(rinv_SSE3, wco_SSE3);
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0);
+ rinv_S1 = gmx_blendzero_pr(rinv_S1, wco_S1);
+ rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2);
+ rinv_S3 = gmx_blendzero_pr(rinv_S3, wco_S3);
#else
/* We only need to mask for the cut-off: blendv is faster */
- rinv_SSE0 = gmx_blendv_pr(rinv_SSE0, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE0));
- rinv_SSE1 = gmx_blendv_pr(rinv_SSE1, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE1));
- rinv_SSE2 = gmx_blendv_pr(rinv_SSE2, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE2));
- rinv_SSE3 = gmx_blendv_pr(rinv_SSE3, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE3));
+ rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
+ rinv_S1 = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1));
+ rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+ rinv_S3 = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3));
#endif
- rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0, rinv_SSE0);
- rinvsq_SSE1 = gmx_mul_pr(rinv_SSE1, rinv_SSE1);
- rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2, rinv_SSE2);
- rinvsq_SSE3 = gmx_mul_pr(rinv_SSE3, rinv_SSE3);
+ rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0);
+ rinvsq_S1 = gmx_mul_pr(rinv_S1, rinv_S1);
+ rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2);
+ rinvsq_S3 = gmx_mul_pr(rinv_S3, rinv_S3);
#ifdef CALC_COULOMB
/* Note that here we calculate force*r, not the usual force/r.
#ifdef EXCL_FORCES
/* Only add 1/r for non-excluded atom pairs */
- rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0, int_SSE0);
- rinv_ex_SSE1 = gmx_and_pr(rinv_SSE1, int_SSE1);
- rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2, int_SSE2);
- rinv_ex_SSE3 = gmx_and_pr(rinv_SSE3, int_SSE3);
+ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
+ rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, int_S1);
+ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
+ rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, int_S3);
#else
/* No exclusion forces, we always need 1/r */
-#define rinv_ex_SSE0 rinv_SSE0
-#define rinv_ex_SSE1 rinv_SSE1
-#define rinv_ex_SSE2 rinv_SSE2
-#define rinv_ex_SSE3 rinv_SSE3
+#define rinv_ex_S0 rinv_S0
+#define rinv_ex_S1 rinv_S1
+#define rinv_ex_S2 rinv_S2
+#define rinv_ex_S3 rinv_S3
#endif
#ifdef CALC_COUL_RF
/* Electrostatic interactions */
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_mul_pr(rsq_SSE0, mrc_3_SSE)));
- frcoul_SSE1 = gmx_mul_pr(qq_SSE1, gmx_add_pr(rinv_ex_SSE1, gmx_mul_pr(rsq_SSE1, mrc_3_SSE)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_mul_pr(rsq_SSE2, mrc_3_SSE)));
- frcoul_SSE3 = gmx_mul_pr(qq_SSE3, gmx_add_pr(rinv_ex_SSE3, gmx_mul_pr(rsq_SSE3, mrc_3_SSE)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(rsq_S1, mrc_3_S)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(rsq_S3, mrc_3_S)));
#ifdef CALC_ENERGIES
- vcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_add_pr(gmx_mul_pr(rsq_SSE0, hrc_3_SSE), moh_rc_SSE)));
- vcoul_SSE1 = gmx_mul_pr(qq_SSE1, gmx_add_pr(rinv_ex_SSE1, gmx_add_pr(gmx_mul_pr(rsq_SSE1, hrc_3_SSE), moh_rc_SSE)));
- vcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_add_pr(gmx_mul_pr(rsq_SSE2, hrc_3_SSE), moh_rc_SSE)));
- vcoul_SSE3 = gmx_mul_pr(qq_SSE3, gmx_add_pr(rinv_ex_SSE3, gmx_add_pr(gmx_mul_pr(rsq_SSE3, hrc_3_SSE), moh_rc_SSE)));
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
+ vcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S)));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+ vcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S)));
#endif
#endif
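
/* Scalar form of the reaction-field expressions above, to make the constant
 * names concrete (a sketch; qq = q_i*q_j including the electrostatics
 * prefactor, k_rf and c_rf as set up from the interaction constants, where
 * mrc_3 = -2*k_rf, hrc_3 = k_rf and moh_rc = -c_rf):
 */
static void reaction_field(double qq, double r, double k_rf, double c_rf,
                           double *fr, double *v)
{
    double rinv = 1.0/r;
    double rsq  = r*r;

    *fr = qq*(rinv - 2.0*k_rf*rsq);    /* force times r, matching frcoul */
    *v  = qq*(rinv + k_rf*rsq - c_rf); /* shifted potential, matching vcoul */
}
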
/* We need to mask (or limit) rsq for the cut-off,
* as large distances can cause an overflow in gmx_pmecorrF/V.
*/
-#ifndef CUTOFF_BLENDV
- brsq_SSE0 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE0, wco_SSE0));
- brsq_SSE1 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE1, wco_SSE1));
- brsq_SSE2 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE2, wco_SSE2));
- brsq_SSE3 = gmx_mul_pr(beta2_SSE, gmx_and_pr(rsq_SSE3, wco_SSE3));
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
+ brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+ brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3));
#else
/* Strangely, putting mul on a separate line is slower (icc 13) */
- brsq_SSE0 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE0, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE0)));
- brsq_SSE1 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE1, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE1)));
- brsq_SSE2 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE2, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE2)));
- brsq_SSE3 = gmx_mul_pr(beta2_SSE, gmx_blendv_pr(rsq_SSE3, zero_SSE, gmx_sub_pr(rc2_SSE, rsq_SSE3)));
-#endif
- ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0), beta_SSE);
- ewcorr_SSE1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE1), beta_SSE);
- ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2), beta_SSE);
- ewcorr_SSE3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE3), beta_SSE);
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_add_pr(rinv_ex_SSE0, gmx_mul_pr(ewcorr_SSE0, brsq_SSE0)));
- frcoul_SSE1 = gmx_mul_pr(qq_SSE1, gmx_add_pr(rinv_ex_SSE1, gmx_mul_pr(ewcorr_SSE1, brsq_SSE1)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_add_pr(rinv_ex_SSE2, gmx_mul_pr(ewcorr_SSE2, brsq_SSE2)));
- frcoul_SSE3 = gmx_mul_pr(qq_SSE3, gmx_add_pr(rinv_ex_SSE3, gmx_mul_pr(ewcorr_SSE3, brsq_SSE3)));
+ brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
+ brsq_S1 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)));
+ brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+ brsq_S3 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)));
+#endif
+ ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
+ ewcorr_S1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
+ ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
+ ewcorr_S3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(ewcorr_S1, brsq_S1)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(ewcorr_S3, brsq_S3)));
#ifdef CALC_ENERGIES
- vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0), beta_SSE);
- vc_sub_SSE1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE1), beta_SSE);
- vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2), beta_SSE);
- vc_sub_SSE3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE3), beta_SSE);
+ vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
+ vc_sub_S1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S);
+ vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+ vc_sub_S3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S);
#endif
#endif /* CALC_COUL_EWALD */
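
/* The analytic target of the gmx_pmecorrF polynomial used above: frcoul is
 * assembled as qq*(1/r + correction) such that the total equals the
 * real-space Ewald force times r. A scalar reference of that target (a
 * sketch; the kernel approximates it in beta^2*r^2 instead of calling
 * erfc/exp):
 */
#include <math.h>

static double ewald_real_space_fr(double qq, double r, double beta)
{
    /* 2/sqrt(pi) written out, as M_PI is not guaranteed by ISO C */
    const double two_over_sqrt_pi = 1.1283791670955126;
    double       br               = beta*r;

    /* F(r)*r = qq*(erfc(beta*r)/r + 2*beta/sqrt(pi)*exp(-beta^2*r^2)) */
    return qq*(erfc(br)/r + two_over_sqrt_pi*beta*exp(-br*br));
}
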
#ifdef CALC_COUL_TAB
/* Electrostatic interactions */
- r_SSE0 = gmx_mul_pr(rsq_SSE0, rinv_SSE0);
- r_SSE1 = gmx_mul_pr(rsq_SSE1, rinv_SSE1);
- r_SSE2 = gmx_mul_pr(rsq_SSE2, rinv_SSE2);
- r_SSE3 = gmx_mul_pr(rsq_SSE3, rinv_SSE3);
+ r_S0 = gmx_mul_pr(rsq_S0, rinv_S0);
+ r_S1 = gmx_mul_pr(rsq_S1, rinv_S1);
+ r_S2 = gmx_mul_pr(rsq_S2, rinv_S2);
+ r_S3 = gmx_mul_pr(rsq_S3, rinv_S3);
/* Convert r to scaled table units */
- rs_SSE0 = gmx_mul_pr(r_SSE0, invtsp_SSE);
- rs_SSE1 = gmx_mul_pr(r_SSE1, invtsp_SSE);
- rs_SSE2 = gmx_mul_pr(r_SSE2, invtsp_SSE);
- rs_SSE3 = gmx_mul_pr(r_SSE3, invtsp_SSE);
+ rs_S0 = gmx_mul_pr(r_S0, invtsp_S);
+ rs_S1 = gmx_mul_pr(r_S1, invtsp_S);
+ rs_S2 = gmx_mul_pr(r_S2, invtsp_S);
+ rs_S3 = gmx_mul_pr(r_S3, invtsp_S);
/* Truncate scaled r to an int */
- ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0);
- ti_SSE1 = gmx_cvttpr_epi32(rs_SSE1);
- ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2);
- ti_SSE3 = gmx_cvttpr_epi32(rs_SSE3);
-#ifdef GMX_X86_SSE4_1
+ ti_S0 = gmx_cvttpr_epi32(rs_S0);
+ ti_S1 = gmx_cvttpr_epi32(rs_S1);
+ ti_S2 = gmx_cvttpr_epi32(rs_S2);
+ ti_S3 = gmx_cvttpr_epi32(rs_S3);
+#ifdef GMX_HAVE_SIMD_FLOOR
/* A SIMD floor, where available, is faster than the gmx_cvtepi32_pr int->real cast */
- rf_SSE0 = gmx_floor_pr(rs_SSE0);
- rf_SSE1 = gmx_floor_pr(rs_SSE1);
- rf_SSE2 = gmx_floor_pr(rs_SSE2);
- rf_SSE3 = gmx_floor_pr(rs_SSE3);
+ rf_S0 = gmx_floor_pr(rs_S0);
+ rf_S1 = gmx_floor_pr(rs_S1);
+ rf_S2 = gmx_floor_pr(rs_S2);
+ rf_S3 = gmx_floor_pr(rs_S3);
#else
- rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0);
- rf_SSE1 = gmx_cvtepi32_pr(ti_SSE1);
- rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2);
- rf_SSE3 = gmx_cvtepi32_pr(ti_SSE3);
+ rf_S0 = gmx_cvtepi32_pr(ti_S0);
+ rf_S1 = gmx_cvtepi32_pr(ti_S1);
+ rf_S2 = gmx_cvtepi32_pr(ti_S2);
+ rf_S3 = gmx_cvtepi32_pr(ti_S3);
#endif
- frac_SSE0 = gmx_sub_pr(rs_SSE0, rf_SSE0);
- frac_SSE1 = gmx_sub_pr(rs_SSE1, rf_SSE1);
- frac_SSE2 = gmx_sub_pr(rs_SSE2, rf_SSE2);
- frac_SSE3 = gmx_sub_pr(rs_SSE3, rf_SSE3);
+ frac_S0 = gmx_sub_pr(rs_S0, rf_S0);
+ frac_S1 = gmx_sub_pr(rs_S1, rf_S1);
+ frac_S2 = gmx_sub_pr(rs_S2, rf_S2);
+ frac_S3 = gmx_sub_pr(rs_S3, rf_S3);
/* Load and interpolate table forces and possibly energies.
* Force and energy can be combined in one table, stride 4: FDV0
* Currently single precision uses FDV0, double precision uses separate F and V tables.
*/
#ifndef CALC_ENERGIES
- load_table_f(tab_coul_F, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0);
- load_table_f(tab_coul_F, ti_SSE1, ti1, ctab0_SSE1, ctab1_SSE1);
- load_table_f(tab_coul_F, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2);
- load_table_f(tab_coul_F, ti_SSE3, ti3, ctab0_SSE3, ctab1_SSE3);
+ load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
+ load_table_f(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1);
+ load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
+ load_table_f(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3);
#else
#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0, ctabv_SSE0);
- load_table_f_v(tab_coul_F, ti_SSE1, ti1, ctab0_SSE1, ctab1_SSE1, ctabv_SSE1);
- load_table_f_v(tab_coul_F, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2, ctabv_SSE2);
- load_table_f_v(tab_coul_F, ti_SSE3, ti3, ctab0_SSE3, ctab1_SSE3, ctabv_SSE3);
+ load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
+ load_table_f_v(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
+ load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
+ load_table_f_v(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
#else
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE0, ti0, ctab0_SSE0, ctab1_SSE0, ctabv_SSE0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE1, ti1, ctab0_SSE1, ctab1_SSE1, ctabv_SSE1);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE2, ti2, ctab0_SSE2, ctab1_SSE2, ctabv_SSE2);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE3, ti3, ctab0_SSE3, ctab1_SSE3, ctabv_SSE3);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
#endif
#endif
- fsub_SSE0 = gmx_add_pr(ctab0_SSE0, gmx_mul_pr(frac_SSE0, ctab1_SSE0));
- fsub_SSE1 = gmx_add_pr(ctab0_SSE1, gmx_mul_pr(frac_SSE1, ctab1_SSE1));
- fsub_SSE2 = gmx_add_pr(ctab0_SSE2, gmx_mul_pr(frac_SSE2, ctab1_SSE2));
- fsub_SSE3 = gmx_add_pr(ctab0_SSE3, gmx_mul_pr(frac_SSE3, ctab1_SSE3));
- frcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_sub_pr(rinv_ex_SSE0, gmx_mul_pr(fsub_SSE0, r_SSE0)));
- frcoul_SSE1 = gmx_mul_pr(qq_SSE1, gmx_sub_pr(rinv_ex_SSE1, gmx_mul_pr(fsub_SSE1, r_SSE1)));
- frcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_sub_pr(rinv_ex_SSE2, gmx_mul_pr(fsub_SSE2, r_SSE2)));
- frcoul_SSE3 = gmx_mul_pr(qq_SSE3, gmx_sub_pr(rinv_ex_SSE3, gmx_mul_pr(fsub_SSE3, r_SSE3)));
+ fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
+ fsub_S1 = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1));
+ fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
+ fsub_S3 = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1)));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3)));
#ifdef CALC_ENERGIES
- vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE0), gmx_add_pr(ctab0_SSE0, fsub_SSE0)));
- vc_sub_SSE1 = gmx_add_pr(ctabv_SSE1, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE1), gmx_add_pr(ctab0_SSE1, fsub_SSE1)));
- vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE2), gmx_add_pr(ctab0_SSE2, fsub_SSE2)));
- vc_sub_SSE3 = gmx_add_pr(ctabv_SSE3, gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE, frac_SSE3), gmx_add_pr(ctab0_SSE3, fsub_SSE3)));
+ vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
+ vc_sub_S1 = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1)));
+ vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+ vc_sub_S3 = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3)));
#endif
#endif /* CALC_COUL_TAB */
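
/* A scalar sketch of the table interpolation above. Per grid point i (with
 * spacing h = 1/tabq_scale) the table stores the value F_i and the forward
 * difference D_i = F_{i+1}-F_i, so the force term is one multiply-add in
 * the fractional offset, and the energy follows by trapezoidal integration
 * over that fraction, which is exactly what the mhalfsp = -0.5/tabq_scale
 * prefactor implements. Names here are illustrative, not the kernel's API.
 */
static void table_interpolate(const double *tab_F, const double *tab_D,
                              const double *tab_V, double r, double scale,
                              double *f, double *v)
{
    double rs   = r*scale;         /* r in table units        */
    int    ti   = (int)rs;         /* grid index (truncation) */
    double frac = rs - (double)ti; /* fractional offset       */

    *f = tab_F[ti] + frac*tab_D[ti];                  /* linear in frac */
    *v = tab_V[ti] - 0.5/scale*frac*(tab_F[ti] + *f); /* trapezoid rule */
}
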
#ifndef NO_SHIFT_EWALD
/* Add Ewald potential shift to vc_sub for convenience */
#ifdef CHECK_EXCLS
- vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0, gmx_and_pr(sh_ewald_SSE, int_SSE0));
- vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1, gmx_and_pr(sh_ewald_SSE, int_SSE1));
- vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2, gmx_and_pr(sh_ewald_SSE, int_SSE2));
- vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3, gmx_and_pr(sh_ewald_SSE, int_SSE3));
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
+ vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, int_S1));
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
+ vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, int_S3));
#else
- vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0, sh_ewald_SSE);
- vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1, sh_ewald_SSE);
- vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2, sh_ewald_SSE);
- vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3, sh_ewald_SSE);
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
+ vc_sub_S1 = gmx_add_pr(vc_sub_S1, sh_ewald_S);
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+ vc_sub_S3 = gmx_add_pr(vc_sub_S3, sh_ewald_S);
#endif
#endif
- vcoul_SSE0 = gmx_mul_pr(qq_SSE0, gmx_sub_pr(rinv_ex_SSE0, vc_sub_SSE0));
- vcoul_SSE1 = gmx_mul_pr(qq_SSE1, gmx_sub_pr(rinv_ex_SSE1, vc_sub_SSE1));
- vcoul_SSE2 = gmx_mul_pr(qq_SSE2, gmx_sub_pr(rinv_ex_SSE2, vc_sub_SSE2));
- vcoul_SSE3 = gmx_mul_pr(qq_SSE3, gmx_sub_pr(rinv_ex_SSE3, vc_sub_SSE3));
+ vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
+ vcoul_S1 = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1));
+ vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+ vcoul_S3 = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3));
#endif
#ifdef CALC_ENERGIES
/* Mask energy for cut-off and diagonal */
- vcoul_SSE0 = gmx_and_pr(vcoul_SSE0, wco_SSE0);
- vcoul_SSE1 = gmx_and_pr(vcoul_SSE1, wco_SSE1);
- vcoul_SSE2 = gmx_and_pr(vcoul_SSE2, wco_SSE2);
- vcoul_SSE3 = gmx_and_pr(vcoul_SSE3, wco_SSE3);
+ vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0);
+ vcoul_S1 = gmx_blendzero_pr(vcoul_S1, wco_S1);
+ vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2);
+ vcoul_S3 = gmx_blendzero_pr(vcoul_S3, wco_S3);
#endif
#endif /* CALC_COULOMB */
/* Lennard-Jones interaction */
#ifdef VDW_CUTOFF_CHECK
- wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0, rcvdw2_SSE);
- wco_vdw_SSE1 = gmx_cmplt_pr(rsq_SSE1, rcvdw2_SSE);
+ wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+ wco_vdw_S1 = gmx_cmplt_pr(rsq_S1, rcvdw2_S);
#ifndef HALF_LJ
- wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2, rcvdw2_SSE);
- wco_vdw_SSE3 = gmx_cmplt_pr(rsq_SSE3, rcvdw2_SSE);
+ wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+ wco_vdw_S3 = gmx_cmplt_pr(rsq_S3, rcvdw2_S);
#endif
#else
/* Same cut-off for Coulomb and VdW, reuse the registers */
-#define wco_vdw_SSE0 wco_SSE0
-#define wco_vdw_SSE1 wco_SSE1
-#define wco_vdw_SSE2 wco_SSE2
-#define wco_vdw_SSE3 wco_SSE3
+#define wco_vdw_S0 wco_S0
+#define wco_vdw_S1 wco_S1
+#define wco_vdw_S2 wco_S2
+#define wco_vdw_S3 wco_S3
#endif
#ifndef LJ_COMB_LB
- rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0, gmx_mul_pr(rinvsq_SSE0, rinvsq_SSE0));
- rinvsix_SSE1 = gmx_mul_pr(rinvsq_SSE1, gmx_mul_pr(rinvsq_SSE1, rinvsq_SSE1));
+ rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+ rinvsix_S1 = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
#ifdef EXCL_FORCES
- rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0, int_SSE0);
- rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1, int_SSE1);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
+ rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, int_S1);
#endif
#ifndef HALF_LJ
- rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2, gmx_mul_pr(rinvsq_SSE2, rinvsq_SSE2));
- rinvsix_SSE3 = gmx_mul_pr(rinvsq_SSE3, gmx_mul_pr(rinvsq_SSE3, rinvsq_SSE3));
+ rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+ rinvsix_S3 = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
#ifdef EXCL_FORCES
- rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2, int_SSE2);
- rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3, int_SSE3);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
+ rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, int_S3);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
- rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0, wco_vdw_SSE0);
- rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1, wco_vdw_SSE1);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+ rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1);
#ifndef HALF_LJ
- rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2, wco_vdw_SSE2);
- rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3, wco_vdw_SSE3);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+ rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3);
#endif
#endif
- FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0, rinvsix_SSE0);
- FrLJ6_SSE1 = gmx_mul_pr(c6_SSE1, rinvsix_SSE1);
+ FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0);
+ FrLJ6_S1 = gmx_mul_pr(c6_S1, rinvsix_S1);
#ifndef HALF_LJ
- FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2, rinvsix_SSE2);
- FrLJ6_SSE3 = gmx_mul_pr(c6_SSE3, rinvsix_SSE3);
+ FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2);
+ FrLJ6_S3 = gmx_mul_pr(c6_S3, rinvsix_S3);
#endif
- FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0, gmx_mul_pr(rinvsix_SSE0, rinvsix_SSE0));
- FrLJ12_SSE1 = gmx_mul_pr(c12_SSE1, gmx_mul_pr(rinvsix_SSE1, rinvsix_SSE1));
+ FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+ FrLJ12_S1 = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1));
#ifndef HALF_LJ
- FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2, gmx_mul_pr(rinvsix_SSE2, rinvsix_SSE2));
- FrLJ12_SSE3 = gmx_mul_pr(c12_SSE3, gmx_mul_pr(rinvsix_SSE3, rinvsix_SSE3));
+ FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+ FrLJ12_S3 = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3));
#endif
#endif /* not LJ_COMB_LB */
#ifdef LJ_COMB_LB
- sir_SSE0 = gmx_mul_pr(sig_SSE0, rinv_SSE0);
- sir_SSE1 = gmx_mul_pr(sig_SSE1, rinv_SSE1);
+ sir_S0 = gmx_mul_pr(sig_S0, rinv_S0);
+ sir_S1 = gmx_mul_pr(sig_S1, rinv_S1);
#ifndef HALF_LJ
- sir_SSE2 = gmx_mul_pr(sig_SSE2, rinv_SSE2);
- sir_SSE3 = gmx_mul_pr(sig_SSE3, rinv_SSE3);
+ sir_S2 = gmx_mul_pr(sig_S2, rinv_S2);
+ sir_S3 = gmx_mul_pr(sig_S3, rinv_S3);
#endif
- sir2_SSE0 = gmx_mul_pr(sir_SSE0, sir_SSE0);
- sir2_SSE1 = gmx_mul_pr(sir_SSE1, sir_SSE1);
+ sir2_S0 = gmx_mul_pr(sir_S0, sir_S0);
+ sir2_S1 = gmx_mul_pr(sir_S1, sir_S1);
#ifndef HALF_LJ
- sir2_SSE2 = gmx_mul_pr(sir_SSE2, sir_SSE2);
- sir2_SSE3 = gmx_mul_pr(sir_SSE3, sir_SSE3);
+ sir2_S2 = gmx_mul_pr(sir_S2, sir_S2);
+ sir2_S3 = gmx_mul_pr(sir_S3, sir_S3);
#endif
- sir6_SSE0 = gmx_mul_pr(sir2_SSE0, gmx_mul_pr(sir2_SSE0, sir2_SSE0));
- sir6_SSE1 = gmx_mul_pr(sir2_SSE1, gmx_mul_pr(sir2_SSE1, sir2_SSE1));
+ sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+ sir6_S1 = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
#ifdef EXCL_FORCES
- sir6_SSE0 = gmx_and_pr(sir6_SSE0, int_SSE0);
- sir6_SSE1 = gmx_and_pr(sir6_SSE1, int_SSE1);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
+ sir6_S1 = gmx_blendzero_pr(sir6_S1, int_S1);
#endif
#ifndef HALF_LJ
- sir6_SSE2 = gmx_mul_pr(sir2_SSE2, gmx_mul_pr(sir2_SSE2, sir2_SSE2));
- sir6_SSE3 = gmx_mul_pr(sir2_SSE3, gmx_mul_pr(sir2_SSE3, sir2_SSE3));
+ sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+ sir6_S3 = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
#ifdef EXCL_FORCES
- sir6_SSE2 = gmx_and_pr(sir6_SSE2, int_SSE2);
- sir6_SSE3 = gmx_and_pr(sir6_SSE3, int_SSE3);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
+ sir6_S3 = gmx_blendzero_pr(sir6_S3, int_S3);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
- sir6_SSE0 = gmx_and_pr(sir6_SSE0, wco_vdw_SSE0);
- sir6_SSE1 = gmx_and_pr(sir6_SSE1, wco_vdw_SSE1);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+ sir6_S1 = gmx_blendzero_pr(sir6_S1, wco_vdw_S1);
#ifndef HALF_LJ
- sir6_SSE2 = gmx_and_pr(sir6_SSE2, wco_vdw_SSE2);
- sir6_SSE3 = gmx_and_pr(sir6_SSE3, wco_vdw_SSE3);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+ sir6_S3 = gmx_blendzero_pr(sir6_S3, wco_vdw_S3);
#endif
#endif
- FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0, sir6_SSE0);
- FrLJ6_SSE1 = gmx_mul_pr(eps_SSE1, sir6_SSE1);
+ FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0);
+ FrLJ6_S1 = gmx_mul_pr(eps_S1, sir6_S1);
#ifndef HALF_LJ
- FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2, sir6_SSE2);
- FrLJ6_SSE3 = gmx_mul_pr(eps_SSE3, sir6_SSE3);
+ FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2);
+ FrLJ6_S3 = gmx_mul_pr(eps_S3, sir6_S3);
#endif
- FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0, sir6_SSE0);
- FrLJ12_SSE1 = gmx_mul_pr(FrLJ6_SSE1, sir6_SSE1);
+ FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+ FrLJ12_S1 = gmx_mul_pr(FrLJ6_S1, sir6_S1);
#ifndef HALF_LJ
- FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2, sir6_SSE2);
- FrLJ12_SSE3 = gmx_mul_pr(FrLJ6_SSE3, sir6_SSE3);
+ FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+ FrLJ12_S3 = gmx_mul_pr(FrLJ6_S3, sir6_S3);
#endif
#if defined CALC_ENERGIES
/* We need C6 and C12 to calculate the LJ potential shift */
- sig2_SSE0 = gmx_mul_pr(sig_SSE0, sig_SSE0);
- sig2_SSE1 = gmx_mul_pr(sig_SSE1, sig_SSE1);
+ sig2_S0 = gmx_mul_pr(sig_S0, sig_S0);
+ sig2_S1 = gmx_mul_pr(sig_S1, sig_S1);
#ifndef HALF_LJ
- sig2_SSE2 = gmx_mul_pr(sig_SSE2, sig_SSE2);
- sig2_SSE3 = gmx_mul_pr(sig_SSE3, sig_SSE3);
+ sig2_S2 = gmx_mul_pr(sig_S2, sig_S2);
+ sig2_S3 = gmx_mul_pr(sig_S3, sig_S3);
#endif
- sig6_SSE0 = gmx_mul_pr(sig2_SSE0, gmx_mul_pr(sig2_SSE0, sig2_SSE0));
- sig6_SSE1 = gmx_mul_pr(sig2_SSE1, gmx_mul_pr(sig2_SSE1, sig2_SSE1));
+ sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+ sig6_S1 = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1));
#ifndef HALF_LJ
- sig6_SSE2 = gmx_mul_pr(sig2_SSE2, gmx_mul_pr(sig2_SSE2, sig2_SSE2));
- sig6_SSE3 = gmx_mul_pr(sig2_SSE3, gmx_mul_pr(sig2_SSE3, sig2_SSE3));
+ sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+ sig6_S3 = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3));
#endif
- c6_SSE0 = gmx_mul_pr(eps_SSE0, sig6_SSE0);
- c6_SSE1 = gmx_mul_pr(eps_SSE1, sig6_SSE1);
+ c6_S0 = gmx_mul_pr(eps_S0, sig6_S0);
+ c6_S1 = gmx_mul_pr(eps_S1, sig6_S1);
#ifndef HALF_LJ
- c6_SSE2 = gmx_mul_pr(eps_SSE2, sig6_SSE2);
- c6_SSE3 = gmx_mul_pr(eps_SSE3, sig6_SSE3);
+ c6_S2 = gmx_mul_pr(eps_S2, sig6_S2);
+ c6_S3 = gmx_mul_pr(eps_S3, sig6_S3);
#endif
- c12_SSE0 = gmx_mul_pr(c6_SSE0, sig6_SSE0);
- c12_SSE1 = gmx_mul_pr(c6_SSE1, sig6_SSE1);
+ c12_S0 = gmx_mul_pr(c6_S0, sig6_S0);
+ c12_S1 = gmx_mul_pr(c6_S1, sig6_S1);
#ifndef HALF_LJ
- c12_SSE2 = gmx_mul_pr(c6_SSE2, sig6_SSE2);
- c12_SSE3 = gmx_mul_pr(c6_SSE3, sig6_SSE3);
+ c12_S2 = gmx_mul_pr(c6_S2, sig6_S2);
+ c12_S3 = gmx_mul_pr(c6_S3, sig6_S3);
#endif
#endif
#endif /* LJ_COMB_LB */
#ifdef CALC_COULOMB
#ifndef ENERGY_GROUPS
- vctotSSE = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0, vcoul_SSE1, vcoul_SSE2, vcoul_SSE3));
+ vctot_S = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
#else
- add_ener_grp(vcoul_SSE0, vctp[0], egp_jj);
- add_ener_grp(vcoul_SSE1, vctp[1], egp_jj);
- add_ener_grp(vcoul_SSE2, vctp[2], egp_jj);
- add_ener_grp(vcoul_SSE3, vctp[3], egp_jj);
+ add_ener_grp(vcoul_S0, vctp[0], egp_jj);
+ add_ener_grp(vcoul_S1, vctp[1], egp_jj);
+ add_ener_grp(vcoul_S2, vctp[2], egp_jj);
+ add_ener_grp(vcoul_S3, vctp[3], egp_jj);
#endif
#endif
#ifdef CALC_LJ
/* Calculate the LJ energies */
- VLJ6_SSE0 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE0, gmx_mul_pr(c6_SSE0, sh_invrc6_SSE)));
- VLJ6_SSE1 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE1, gmx_mul_pr(c6_SSE1, sh_invrc6_SSE)));
+ VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
+ VLJ6_S1 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S)));
#ifndef HALF_LJ
- VLJ6_SSE2 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE2, gmx_mul_pr(c6_SSE2, sh_invrc6_SSE)));
- VLJ6_SSE3 = gmx_mul_pr(sixthSSE, gmx_sub_pr(FrLJ6_SSE3, gmx_mul_pr(c6_SSE3, sh_invrc6_SSE)));
+ VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
+ VLJ6_S3 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S)));
#endif
- VLJ12_SSE0 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE0, gmx_mul_pr(c12_SSE0, sh_invrc12_SSE)));
- VLJ12_SSE1 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE1, gmx_mul_pr(c12_SSE1, sh_invrc12_SSE)));
+ VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
+ VLJ12_S1 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S)));
#ifndef HALF_LJ
- VLJ12_SSE2 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE2, gmx_mul_pr(c12_SSE2, sh_invrc12_SSE)));
- VLJ12_SSE3 = gmx_mul_pr(twelvethSSE, gmx_sub_pr(FrLJ12_SSE3, gmx_mul_pr(c12_SSE3, sh_invrc12_SSE)));
+ VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
+ VLJ12_S3 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S)));
#endif
- VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0, VLJ6_SSE0);
- VLJ_SSE1 = gmx_sub_pr(VLJ12_SSE1, VLJ6_SSE1);
+ VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
+ VLJ_S1 = gmx_sub_pr(VLJ12_S1, VLJ6_S1);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2, VLJ6_SSE2);
- VLJ_SSE3 = gmx_sub_pr(VLJ12_SSE3, VLJ6_SSE3);
+ VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
+ VLJ_S3 = gmx_sub_pr(VLJ12_S3, VLJ6_S3);
#endif
/* The potential shift should be removed for pairs beyond cut-off */
- VLJ_SSE0 = gmx_and_pr(VLJ_SSE0, wco_vdw_SSE0);
- VLJ_SSE1 = gmx_and_pr(VLJ_SSE1, wco_vdw_SSE1);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
+ VLJ_S1 = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_and_pr(VLJ_SSE2, wco_vdw_SSE2);
- VLJ_SSE3 = gmx_and_pr(VLJ_SSE3, wco_vdw_SSE3);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
+ VLJ_S3 = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3);
#endif
#ifdef CHECK_EXCLS
/* The potential shift should be removed for excluded pairs */
- VLJ_SSE0 = gmx_and_pr(VLJ_SSE0, int_SSE0);
- VLJ_SSE1 = gmx_and_pr(VLJ_SSE1, int_SSE1);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
+ VLJ_S1 = gmx_blendzero_pr(VLJ_S1, int_S1);
#ifndef HALF_LJ
- VLJ_SSE2 = gmx_and_pr(VLJ_SSE2, int_SSE2);
- VLJ_SSE3 = gmx_and_pr(VLJ_SSE3, int_SSE3);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
+ VLJ_S3 = gmx_blendzero_pr(VLJ_S3, int_S3);
#endif
#endif
#ifndef ENERGY_GROUPS
- VvdwtotSSE = gmx_add_pr(VvdwtotSSE,
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
#ifndef HALF_LJ
- gmx_sum4_pr(VLJ_SSE0, VLJ_SSE1, VLJ_SSE2, VLJ_SSE3)
+ gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
#else
- gmx_add_pr(VLJ_SSE0, VLJ_SSE1)
+ gmx_add_pr(VLJ_S0, VLJ_S1)
#endif
- );
+ );
#else
- add_ener_grp(VLJ_SSE0, vvdwtp[0], egp_jj);
- add_ener_grp(VLJ_SSE1, vvdwtp[1], egp_jj);
+ add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
+ add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
#ifndef HALF_LJ
- add_ener_grp(VLJ_SSE2, vvdwtp[2], egp_jj);
- add_ener_grp(VLJ_SSE3, vvdwtp[3], egp_jj);
+ add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
+ add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
#endif
#endif
#endif /* CALC_LJ */
#endif /* CALC_ENERGIES */
#ifdef CALC_LJ
- fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE0,
+ gmx_add_pr(frcoul_S0,
#else
(
#endif
- gmx_sub_pr(FrLJ12_SSE0, FrLJ6_SSE0)));
- fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1,
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+ fscal_S1 = gmx_mul_pr(rinvsq_S1,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE1,
+ gmx_add_pr(frcoul_S1,
#else
(
#endif
- gmx_sub_pr(FrLJ12_SSE1, FrLJ6_SSE1)));
+ gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
#else
- fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0, frcoul_SSE0);
- fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1, frcoul_SSE1);
+ fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+ fscal_S1 = gmx_mul_pr(rinvsq_S1, frcoul_S1);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
- fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE2,
+ gmx_add_pr(frcoul_S2,
#else
(
#endif
- gmx_sub_pr(FrLJ12_SSE2, FrLJ6_SSE2)));
- fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3,
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+ fscal_S3 = gmx_mul_pr(rinvsq_S3,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_SSE3,
+ gmx_add_pr(frcoul_S3,
#else
(
#endif
- gmx_sub_pr(FrLJ12_SSE3, FrLJ6_SSE3)));
+ gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
#else
/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
- fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2, frcoul_SSE2);
- fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3, frcoul_SSE3);
+ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+ fscal_S3 = gmx_mul_pr(rinvsq_S3, frcoul_S3);
#endif
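
/* Why the loop accumulates force*r: multiplying by rinvsq turns F*r into
 * F/r, and the Cartesian force components then follow from plain multiplies
 * with the distance vector, with no extra normalization (scalar sketch):
 */
static void pair_force(double fr_times_r, double rinvsq,
                       double dx, double dy, double dz, double f[3])
{
    double fscal = fr_times_r*rinvsq; /* = F(r)/r */

    f[0] = fscal*dx;
    f[1] = fscal*dy;
    f[2] = fscal*dz;
}
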
/* Calculate temporary vectorial force */
- tx_SSE0 = gmx_mul_pr(fscal_SSE0, dx_SSE0);
- tx_SSE1 = gmx_mul_pr(fscal_SSE1, dx_SSE1);
- tx_SSE2 = gmx_mul_pr(fscal_SSE2, dx_SSE2);
- tx_SSE3 = gmx_mul_pr(fscal_SSE3, dx_SSE3);
- ty_SSE0 = gmx_mul_pr(fscal_SSE0, dy_SSE0);
- ty_SSE1 = gmx_mul_pr(fscal_SSE1, dy_SSE1);
- ty_SSE2 = gmx_mul_pr(fscal_SSE2, dy_SSE2);
- ty_SSE3 = gmx_mul_pr(fscal_SSE3, dy_SSE3);
- tz_SSE0 = gmx_mul_pr(fscal_SSE0, dz_SSE0);
- tz_SSE1 = gmx_mul_pr(fscal_SSE1, dz_SSE1);
- tz_SSE2 = gmx_mul_pr(fscal_SSE2, dz_SSE2);
- tz_SSE3 = gmx_mul_pr(fscal_SSE3, dz_SSE3);
+ tx_S0 = gmx_mul_pr(fscal_S0, dx_S0);
+ tx_S1 = gmx_mul_pr(fscal_S1, dx_S1);
+ tx_S2 = gmx_mul_pr(fscal_S2, dx_S2);
+ tx_S3 = gmx_mul_pr(fscal_S3, dx_S3);
+ ty_S0 = gmx_mul_pr(fscal_S0, dy_S0);
+ ty_S1 = gmx_mul_pr(fscal_S1, dy_S1);
+ ty_S2 = gmx_mul_pr(fscal_S2, dy_S2);
+ ty_S3 = gmx_mul_pr(fscal_S3, dy_S3);
+ tz_S0 = gmx_mul_pr(fscal_S0, dz_S0);
+ tz_S1 = gmx_mul_pr(fscal_S1, dz_S1);
+ tz_S2 = gmx_mul_pr(fscal_S2, dz_S2);
+ tz_S3 = gmx_mul_pr(fscal_S3, dz_S3);
/* Increment i atom force */
- fix_SSE0 = gmx_add_pr(fix_SSE0, tx_SSE0);
- fix_SSE1 = gmx_add_pr(fix_SSE1, tx_SSE1);
- fix_SSE2 = gmx_add_pr(fix_SSE2, tx_SSE2);
- fix_SSE3 = gmx_add_pr(fix_SSE3, tx_SSE3);
- fiy_SSE0 = gmx_add_pr(fiy_SSE0, ty_SSE0);
- fiy_SSE1 = gmx_add_pr(fiy_SSE1, ty_SSE1);
- fiy_SSE2 = gmx_add_pr(fiy_SSE2, ty_SSE2);
- fiy_SSE3 = gmx_add_pr(fiy_SSE3, ty_SSE3);
- fiz_SSE0 = gmx_add_pr(fiz_SSE0, tz_SSE0);
- fiz_SSE1 = gmx_add_pr(fiz_SSE1, tz_SSE1);
- fiz_SSE2 = gmx_add_pr(fiz_SSE2, tz_SSE2);
- fiz_SSE3 = gmx_add_pr(fiz_SSE3, tz_SSE3);
+ fix_S0 = gmx_add_pr(fix_S0, tx_S0);
+ fix_S1 = gmx_add_pr(fix_S1, tx_S1);
+ fix_S2 = gmx_add_pr(fix_S2, tx_S2);
+ fix_S3 = gmx_add_pr(fix_S3, tx_S3);
+ fiy_S0 = gmx_add_pr(fiy_S0, ty_S0);
+ fiy_S1 = gmx_add_pr(fiy_S1, ty_S1);
+ fiy_S2 = gmx_add_pr(fiy_S2, ty_S2);
+ fiy_S3 = gmx_add_pr(fiy_S3, ty_S3);
+ fiz_S0 = gmx_add_pr(fiz_S0, tz_S0);
+ fiz_S1 = gmx_add_pr(fiz_S1, tz_S1);
+ fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
+ fiz_S3 = gmx_add_pr(fiz_S3, tz_S3);
/* Decrement j atom force */
gmx_store_pr(f+ajx,
- gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
+ gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) ));
gmx_store_pr(f+ajy,
- gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
+ gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) ));
gmx_store_pr(f+ajz,
- gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
+ gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) ));
}
-#undef rinv_ex_SSE0
-#undef rinv_ex_SSE1
-#undef rinv_ex_SSE2
-#undef rinv_ex_SSE3
+#undef rinv_ex_S0
+#undef rinv_ex_S1
+#undef rinv_ex_S2
+#undef rinv_ex_S3
-#undef wco_vdw_SSE0
-#undef wco_vdw_SSE1
-#undef wco_vdw_SSE2
-#undef wco_vdw_SSE3
+#undef wco_vdw_S0
+#undef wco_vdw_S1
+#undef wco_vdw_S2
+#undef wco_vdw_S3
-#undef CUTOFF_BLENDV
+#undef NBNXN_CUTOFF_USE_BLENDV
#undef EXCL_FORCES
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ GMX_SIMD_WIDTH_HERE
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE 4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
-#define STRIDE 8
+/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+#define STRIDE GMX_SIMD_WIDTH_HERE
+#else
+#define STRIDE UNROLLI
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
+#if GMX_SIMD_WIDTH_HERE == 2
+#define SUM_SIMD(x) (x[0]+x[1])
+#else
+#if GMX_SIMD_WIDTH_HERE == 4
+#define SUM_SIMD(x) SUM_SIMD4(x)
#else
-/* double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#error "unsupported kernel configuration"
+#endif
#endif
#endif
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x8 kernel */
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+
+/* Decide if we should use the FDV0 table layout */
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE/2 == 4
#define TAB_FDV0
+#endif
#else
-/* double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
#endif
#endif
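
/* What the FDV0 layout stands for (an assumed layout for illustration):
 * per table point i the four reals {F_i, F_{i+1}-F_i, V_i, 0} are stored
 * contiguously, so one aligned 4-wide (or half-of-8-wide AVX) load fetches
 * force, force difference and energy for a lane at once. A sketch of
 * building such a table from separate F and V arrays:
 */
static void make_fdv0(const float *F, const float *V, int n, float *fdv0)
{
    int i;

    /* the last point has no forward difference; real tables pad the end */
    for (i = 0; i < n - 1; i++)
    {
        fdv0[4*i + 0] = F[i];          /* F            */
        fdv0[4*i + 1] = F[i+1] - F[i]; /* D            */
        fdv0[4*i + 2] = V[i];          /* V            */
        fdv0[4*i + 3] = 0;             /* zero padding */
    }
}
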
+
#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
real *vctp[UNROLLI];
#endif
- gmx_mm_pr shX_SSE;
- gmx_mm_pr shY_SSE;
- gmx_mm_pr shZ_SSE;
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
- gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
- gmx_mm_pr fix_SSE0, fiy_SSE0, fiz_SSE0;
- gmx_mm_pr fix_SSE1, fiy_SSE1, fiz_SSE1;
- gmx_mm_pr fix_SSE2, fiy_SSE2, fiz_SSE2;
- gmx_mm_pr fix_SSE3, fiy_SSE3, fiz_SSE3;
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S1, iy_S1, iz_S1;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr ix_S3, iy_S3, iz_S3;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S1, fiy_S1, fiz_S1;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
+ gmx_mm_pr fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
- __m128 fix_SSE, fiy_SSE, fiz_SSE;
+ __m128 fix_S, fiy_S, fiz_S;
#else
- __m256d fix_SSE, fiy_SSE, fiz_SSE;
+ __m256d fix_S, fiy_S, fiz_S;
#endif
#else
- __m128d fix0_SSE, fiy0_SSE, fiz0_SSE;
- __m128d fix2_SSE, fiy2_SSE, fiz2_SSE;
+ __m128d fix0_S, fiy0_S, fiz0_S;
+ __m128d fix2_S, fiy2_S, fiz2_S;
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
- __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
- __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
- __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
-#else
- /* For double precision we need to set two 32bit ints for one double */
- __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
- __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
- __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
-#endif
-#endif
-#ifdef GMX_MM256_HERE
- /* AVX: use floating point masks, as there are no integer instructions */
-#ifndef GMX_DOUBLE
- gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
- gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+ gmx_mm_pr diag_jmi_S;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
#else
- /* There is no 256-bit int to double conversion, so we use float here */
- __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
- __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
- __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
- __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
-#endif
+ gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
+ gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
#endif
- gmx_mm_pr diag_jmi_SSE;
-#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_SSE0, diag_SSE1, diag_SSE2, diag_SSE3;
+#ifdef gmx_checkbitmask_epi32
+ gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
#else
- gmx_mm_pr diag0_SSE0, diag0_SSE1, diag0_SSE2, diag0_SSE3;
- gmx_mm_pr diag1_SSE0, diag1_SSE1, diag1_SSE2, diag1_SSE3;
+ gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
#endif
-#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
- __m128i zeroi_SSE = _mm_setzero_si128();
-#endif
- gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
- gmx_mm_pr one_SSE = gmx_set1_pr(1.0);
- gmx_mm_pr iq_SSE0 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE1 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE2 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE3 = gmx_setzero_pr();
- gmx_mm_pr mrc_3_SSE;
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S1 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr iq_S3 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
#ifdef CALC_ENERGIES
- gmx_mm_pr hrc_3_SSE, moh_rc_SSE;
+ gmx_mm_pr hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
/* Coulomb table variables */
- gmx_mm_pr invtsp_SSE;
+ gmx_mm_pr invtsp_S;
const real *tab_coul_F;
#ifndef TAB_FDV0
const real *tab_coul_V;
#endif
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
int ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
int ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr mhalfsp_SSE;
+ gmx_mm_pr mhalfsp_S;
#endif
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr beta2_SSE, beta_SSE;
+ gmx_mm_pr beta2_S, beta_S;
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
- gmx_mm_pr sh_ewald_SSE;
+ gmx_mm_pr sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
const real *ljc;
- gmx_mm_pr hsig_i_SSE0, seps_i_SSE0;
- gmx_mm_pr hsig_i_SSE1, seps_i_SSE1;
- gmx_mm_pr hsig_i_SSE2, seps_i_SSE2;
- gmx_mm_pr hsig_i_SSE3, seps_i_SSE3;
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S1, seps_i_S1;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
+ gmx_mm_pr hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
real pvdw_array[2*UNROLLI*UNROLLJ+3];
real *pvdw_c6, *pvdw_c12;
- gmx_mm_pr c6_SSE0, c12_SSE0;
- gmx_mm_pr c6_SSE1, c12_SSE1;
- gmx_mm_pr c6_SSE2, c12_SSE2;
- gmx_mm_pr c6_SSE3, c12_SSE3;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
#endif
#ifdef LJ_COMB_GEOM
const real *ljc;
- gmx_mm_pr c6s_SSE0, c12s_SSE0;
- gmx_mm_pr c6s_SSE1, c12s_SSE1;
- gmx_mm_pr c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
- gmx_mm_pr c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
- gmx_mm_pr vctotSSE, VvdwtotSSE;
- gmx_mm_pr sixthSSE, twelvethSSE;
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
- gmx_mm_pr avoid_sing_SSE;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr rcvdw2_SSE;
+ gmx_mm_pr rcvdw2_S;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sh_invrc6_SSE, sh_invrc12_SSE;
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
/* cppcheck-suppress unassignedVariable */
real tmpsum_array[15], *tmpsum;
#endif
/* Load j-i for the first i */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
- diag0_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
+ diag0_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
#if UNROLLI == 2*UNROLLJ
/* Load j-i for the second half of the j-cluster */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
#endif
- diag1_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag1_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#endif
#endif
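
/* How the diagonal masks above come about, in scalar form: the load gives
 * diag_jmi[lane] = j - i for the first i row, 0 < j-i is true exactly for
 * the strictly-upper-diagonal pairs, and subtracting one shifts the
 * diagonal by one lane for each following i row (sketch; 1 stands in for
 * the all-ones SIMD comparison result):
 */
static void make_diag_masks(int unrollj, int diag[4][8])
{
    int i, j;

    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < unrollj; j++)
        {
            diag[i][j] = (0 < j - i) ? 1 : 0; /* keep only j > i */
        }
    }
}
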
+ /* Load masks for topology exclusion masking */
+#ifdef gmx_checkbitmask_epi32
+ mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
+#else
+ mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
+ mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
+ mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
+ mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
+#endif
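
/* Scalar equivalent of the exclusion check these masks prepare for: the
 * pair list packs one bit per (i,j) pair of a cluster pair into an
 * unsigned int, and lane j of row i tests bit i*UNROLLJ + j. A set bit
 * is assumed to flag the pair as interacting (self and topology
 * exclusions cleared):
 */
static int pair_interacts(unsigned int excl_bits, int i, int j, int unrollj)
{
    return (excl_bits & (1U << (i*unrollj + j))) != 0;
}
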
+
#ifdef CALC_COUL_TAB
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
/* Generate aligned table index pointers */
- ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti1 = gmx_simd_align_int(ti1_array);
+ ti2 = gmx_simd_align_int(ti2_array);
+ ti3 = gmx_simd_align_int(ti3_array);
#endif
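
/* What gmx_simd_align_int replaces (and presumably still does): round a
 * pointer up to the first address aligned to the SIMD width, which is why
 * the ti*_array buffers above are over-allocated by GMX_SIMD_WIDTH_HERE-1
 * elements (sketch):
 */
#include <stddef.h>

static int *simd_align_int(int *p, size_t simd_width)
{
    size_t a = simd_width*sizeof(int); /* required alignment in bytes */

    return (int *)(((size_t)p + a - 1) & ~(a - 1));
}
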
- invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
- mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif
#ifdef TAB_FDV0
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
- beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
- beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
#endif
#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
- sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
q = nbat->q;
shiftvec = shift_vec[0];
x = nbat->x;
- avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
/* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
- rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
- rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif
#ifdef CALC_ENERGIES
- sixthSSE = gmx_set1_pr(1.0/6.0);
- twelvethSSE = gmx_set1_pr(1.0/12.0);
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
- sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
- sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif
- mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
- hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
- moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#ifdef CALC_ENERGIES
- tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+ tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
- shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+ shf = gmx_simd_align_real(shf_array);
#endif
#ifdef FIX_LJ_C
- pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c6 = gmx_simd_align_real(pvdw_array+3);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
for (jp = 0; jp < UNROLLJ; jp++)
pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
}
- c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
- c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
- c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
- c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
- c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
- c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
- c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
- c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- shX_SSE = gmx_load1_pr(shiftvec+ish3);
- shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
- shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
#if UNROLLJ <= 4
sci = ci*STRIDE;
/* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
- ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_SSE);
- ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_SSE);
- ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_SSE);
- ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_SSE);
- iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_SSE);
- iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_SSE);
- iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_SSE);
- iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_SSE);
- iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_SSE);
- iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_SSE);
- iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_SSE);
- iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_SSE);
+ ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
+ ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
+ ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
+ ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
+ iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
+ iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
+ iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
+ iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
+ iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
+ iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
+ iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
+ iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
if (do_coul)
{
- iq_SSE0 = gmx_set1_pr(facel*q[sci]);
- iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
- iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
- iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
+ iq_S0 = gmx_set1_pr(facel*q[sci]);
+ iq_S1 = gmx_set1_pr(facel*q[sci+1]);
+ iq_S2 = gmx_set1_pr(facel*q[sci+2]);
+ iq_S3 = gmx_set1_pr(facel*q[sci+3]);
}
#ifdef LJ_COMB_LB
- hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
- hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
- hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
- hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
- seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
- seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
+ hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
+ hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
+ hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
+ seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
- c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
- c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
+ c6s_S0 = gmx_load1_pr(ljc+sci2+0);
+ c6s_S1 = gmx_load1_pr(ljc+sci2+1);
if (!half_LJ)
{
- c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
- c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
+ c6s_S2 = gmx_load1_pr(ljc+sci2+2);
+ c6s_S3 = gmx_load1_pr(ljc+sci2+3);
}
- c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
if (!half_LJ)
{
- c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
}
#else
nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
#endif
/* Zero the potential energy for this list */
- VvdwtotSSE = gmx_setzero_pr();
- vctotSSE = gmx_setzero_pr();
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
/* Clear i atom forces */
- fix_SSE0 = gmx_setzero_pr();
- fix_SSE1 = gmx_setzero_pr();
- fix_SSE2 = gmx_setzero_pr();
- fix_SSE3 = gmx_setzero_pr();
- fiy_SSE0 = gmx_setzero_pr();
- fiy_SSE1 = gmx_setzero_pr();
- fiy_SSE2 = gmx_setzero_pr();
- fiy_SSE3 = gmx_setzero_pr();
- fiz_SSE0 = gmx_setzero_pr();
- fiz_SSE1 = gmx_setzero_pr();
- fiz_SSE2 = gmx_setzero_pr();
- fiz_SSE3 = gmx_setzero_pr();
+ fix_S0 = gmx_setzero_pr();
+ fix_S1 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fix_S3 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S1 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiy_S3 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S1 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
+ fiz_S3 = gmx_setzero_pr();
cjind = cjind0;
/* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
-#define gmx_load_ps4 _mm_load_ps
-#define gmx_store_ps4 _mm_store_ps
-#define gmx_add_ps4 _mm_add_ps
+#define gmx_load_pr4 _mm_load_ps
+#define gmx_store_pr4 _mm_store_ps
+#define gmx_add_pr4 _mm_add_ps
#else
-#define gmx_load_ps4 _mm256_load_pd
-#define gmx_store_ps4 _mm256_store_pd
-#define gmx_add_ps4 _mm256_add_pd
+#define gmx_load_pr4 _mm256_load_pd
+#define gmx_store_pr4 _mm256_store_pd
+#define gmx_add_pr4 _mm256_add_pd
#endif
- GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3, fix_SSE);
- gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3, fiy_SSE);
- gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3, fiz_SSE);
- gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
- gmx_store_ps4(shf, fix_SSE);
+ gmx_store_pr4(shf, fix_S);
fshift[ish3+0] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiy_SSE);
+ gmx_store_pr4(shf, fiy_S);
fshift[ish3+1] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiz_SSE);
+ gmx_store_pr4(shf, fiz_S);
fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
+ _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
+ _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
+ _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
+ _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
+ _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
+ _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
fshift[ish3+1] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
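
/* The reduction the TRANSPOSE_SUM macros above perform, in scalar form:
 * each of the four accumulators holds per-lane partial forces for one
 * i atom; summing each across its lanes and packing the four totals into
 * one register allows a single 4-wide read-modify-write of f[scix..] (a
 * sketch; width is the SIMD width, at most 8 here):
 */
static void transpose_sum4(const float in[4][8], int width, float out[4])
{
    int i, lane;

    for (i = 0; i < 4; i++)
    {
        out[i] = 0;
        for (lane = 0; lane < width; lane++)
        {
            out[i] += in[i][lane];
        }
    }
}
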
#ifdef CALC_ENERGIES
if (do_coul)
{
- gmx_store_pr(tmpsum, vctotSSE);
+ gmx_store_pr(tmpsum, vctot_S);
*Vc += SUM_SIMD(tmpsum);
}
- gmx_store_pr(tmpsum, VvdwtotSSE);
+ gmx_store_pr(tmpsum, Vvdwtot_S);
*Vvdw += SUM_SIMD(tmpsum);
#endif
#endif
}
-#undef gmx_load_ps4
-#undef gmx_store_ps4
-#undef gmx_store_ps4
+
+#undef gmx_load_pr4
+#undef gmx_store_pr4
+#undef gmx_add_pr4
#undef CALC_SHIFTFORCES
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE
+
+#undef GMX_USE_HALF_WIDTH_SIMD_HERE
out1 = _mm_unpackhi_pd(in0, in1); \
}
-#if defined GMX_MM128_HERE || !defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128 || !defined GMX_DOUBLE
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1) \
{ \
out = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
}
-#ifndef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#ifndef GMX_DOUBLE
/* Sum the elements within each input register and store the sums in out */
#define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
#endif
#endif
-#ifdef GMX_MM128_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
static inline __m128
gmx_mm128_invsqrt_ps_single(__m128 x)
#endif
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
static inline __m256
gmx_mm256_invsqrt_ps_single(__m256 x)
/* Force and energy table load and interpolation routines */
-#if defined GMX_MM128_HERE && !defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
{ \
#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
#define GMX_2_MM_TO_M256(in0, in1, out) \
#endif
-#if defined GMX_MM128_HERE && defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
{ \
#endif
-#if defined GMX_MM256_HERE && defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
{ \
* but it is only used with AVX.
*/
-#if defined GMX_MM128_HERE && !defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
{ \
#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
{ \
#endif
-#if defined GMX_MM128_HERE && defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
#define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
{ \
#endif
-#if defined GMX_MM256_HERE && defined GMX_DOUBLE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
/* Put two 128-bit 2-double registers into one 256-bit 4-double register */
#define GMX_2_M128D_TO_M256D(in0, in1, out) \
}
}
-#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8 && defined gmx_mm_hpr
/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
* a single SIMD register.
*/
{
gmx_mm_hpr v_SSE;
- v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_load_hpr(v_SSE, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE0));
}
for (jj = 0; jj < (UNROLLJ/2); jj++)
{
gmx_mm_hpr v_SSE;
- v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_load_hpr(v_SSE, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE1));
}
}
/* Sort particle index a on coordinates x along dim.
* Backwards tells if we want decreasing instead of increasing coordinates.
* h0 is the minimum of the coordinate range.
- * invh is the inverse hole spacing.
- * nsort, the theortical hole limit, is only used for debugging.
+ * invh is the inverse length of the sorting range.
+ * n_per_h (>= n) is the expected average number of particles per length 1/invh.
 * sort is the sorting work array.
+ * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
+ * or, more simply, allocate at least n*SGSF elements.
*/
static void sort_atoms(int dim, gmx_bool Backwards,
int *a, int n, rvec *x,
- real h0, real invh, int nsort, int *sort)
+ real h0, real invh, int n_per_h,
+ int *sort)
{
- int i, c;
+ int nsort, i, c;
int zi, zim, zi_min, zi_max;
int cp, tmp;
return;
}
+#ifndef NDEBUG
+ if (n > n_per_h)
+ {
+ gmx_incons("n > n_per_h");
+ }
+#endif
+
+ /* Transform the inverse sorting-range length into the inverse hole size */
+ invh *= n_per_h*SORT_GRID_OVERSIZE;
+
+ /* Set nsort to the maximum possible number of holes used.
+ * In the worst case all n elements end up in the last bin.
+ */
+ nsort = n_per_h*SORT_GRID_OVERSIZE + n;
+
/* Determine the index range used, so we can limit it for the second pass */
zi_min = INT_MAX;
zi_max = -1;
*/
zi = (int)((x[a[i]][dim] - h0)*invh);
-#ifdef DEBUG_NBNXN_GRIDDING
- if (zi < 0 || zi >= nsort)
+#ifndef NDEBUG
+ /* Since rounding can have an effect, we use > instead of >= here */
+ if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
{
- gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
- a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi, nsort);
+ gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
+ a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
+ n_per_h, SORT_GRID_OVERSIZE);
}
#endif
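
For context, sort_atoms is a "hole" sort: each particle is hashed to a slot proportional to its coordinate in a work array with more slots than particles, so most placements hit an empty hole. Below is a minimal sketch of the idea, assuming 1-D coordinates and resolving collisions with a simple forward scan; the real function additionally orders colliding entries by coordinate and supports the Backwards direction.

    /* Simplified hole sort: place index a[i] at hole (x[a[i]] - h0)*invh,
     * scan forward on collision, then compact the holes back into a.
     * The caller must ensure nsort exceeds the largest hole index by
     * at least n, as sort_atoms guarantees via SORT_GRID_OVERSIZE. */
    static void hole_sort(int *a, int n, const double *x,
                          double h0, double invh,
                          int *sort, int nsort)
    {
        int i, zi, c;

        for (i = 0; i < nsort; i++)
        {
            sort[i] = -1;          /* all holes start empty */
        }
        for (i = 0; i < n; i++)
        {
            zi = (int)((x[a[i]] - h0)*invh);
            while (sort[zi] >= 0)
            {
                zi++;              /* collision: take the next hole */
            }
            sort[zi] = a[i];
        }
        c = 0;
        for (i = 0; i < nsort; i++)
        {
            if (sort[i] >= 0)
            {
                a[c++] = sort[i];  /* compacted result is ordered */
            }
        }
    }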
sort_atoms(ZZ, FALSE,
nbs->a+ash, na, x,
grid->c0[ZZ],
- ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
- ncz*grid->na_sc*SGSF, sort_work);
+ 1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ sort_work);
/* Fill the ncz cells in this column */
cfilled = grid->cxy_ind[cxy];
sort_atoms(ZZ, FALSE,
nbs->a+ash, na, x,
grid->c0[ZZ],
- ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
- ncz*grid->na_sc*SGSF, sort_work);
+ 1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
+ sort_work);
/* This loop goes over the supercells and subcells along z at once */
for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
sort_atoms(YY, (sub_z & 1),
nbs->a+ash_z, na_z, x,
grid->c0[YY]+cy*grid->sy,
- subdiv_y*SORT_GRID_OVERSIZE*grid->inv_sy,
- subdiv_y*SGSF, sort_work);
+ grid->inv_sy, subdiv_z,
+ sort_work);
#endif
for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
nbs->a+ash_y, na_y, x,
grid->c0[XX]+cx*grid->sx,
- subdiv_x*SORT_GRID_OVERSIZE*grid->inv_sx,
- subdiv_x*SGSF, sort_work);
+ grid->inv_sx, subdiv_y,
+ sort_work);
#endif
for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
-#ifdef DEBUG_NBNXN_GRIDDING
- if (cx < 0 || cx >= grid->ncx ||
- cy < 0 || cy >= grid->ncy)
+#ifndef NDEBUG
+ if (cx < 0 || cx > grid->ncx ||
+ cy < 0 || cy > grid->ncy)
{
gmx_fatal(FARGS,
"grid cell cx %d cy %d out of range (max %d %d)\n"
}
/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
-static unsigned int get_imask_x86_simd128(gmx_bool rdiag, int ci, int cj)
+static unsigned int get_imask_simd128(gmx_bool rdiag, int ci, int cj)
{
#ifndef GMX_DOUBLE /* cj-size = 4 */
return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
}
/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
-static unsigned int get_imask_x86_simd256(gmx_bool rdiag, int ci, int cj)
+static unsigned int get_imask_simd256(gmx_bool rdiag, int ci, int cj)
{
#ifndef GMX_DOUBLE /* cj-size = 8 */
return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
#ifdef GMX_NBNXN_SIMD
#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define get_imask_x86_simd_4xn get_imask_x86_simd128
+#define get_imask_simd_4xn get_imask_simd128
#else
#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define get_imask_x86_simd_4xn get_imask_x86_simd256
-#define get_imask_x86_simd_2xnn get_imask_x86_simd128
+#define get_imask_simd_4xn get_imask_simd256
+#define get_imask_simd_2xnn get_imask_simd128
#else
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
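
As an aside, the diagonal masks exist so that, when an i-cluster is paired with the j-cluster containing the same atoms, each pair is processed only once and self-interactions are skipped. Assuming, purely for illustration, the convention that bit j*4+i of the mask enables the interaction of i-atom i with j-atom j (the kernels define the actual constants), a 4x4 diagonal mask keeping only i < j pairs could be built like:

    /* Build a mask with bit (j*4 + i) set only for i < j, excluding the
     * cluster diagonal and lower triangle.  Illustrative only: the bit
     * convention is an assumption, not taken from the kernels. */
    static unsigned int diag_mask_4x4(void)
    {
        unsigned int mask = 0;
        int          i, j;

        for (j = 0; j < 4; j++)
        {
            for (i = 0; i < j; i++)
            {
                mask |= 1U << (j*4 + i);
            }
        }
        return mask;
    }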
if (d2 < rbb2 ||
(d2 < rl2 &&
#ifdef NBNXN_PBB_SSE
- subc_in_range_sse8
+ subc_in_range_sse8
#else
- subc_in_range_x
+ subc_in_range_x
#endif
- (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
+ (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
#else
/* Check if the distance between the two bounding boxes
* is within the pair-list cut-off.
fprintf(fp, " sj %5d imask %x\n",
nbl->cj4[j4].cj[j],
nbl->cj4[j4].imei[0].imask);
- for (si=0; si<GPU_NSUBCELL; si++)
+ for (si = 0; si < GPU_NSUBCELL; si++)
{
if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
{
}
/* Count the entries of each size */
- for(i = 0; i <= m; i++)
+ for (i = 0; i <= m; i++)
{
work->sort[i] = 0;
}
- for(s = 0; s < nbl->nsci; s++)
+ for (s = 0; s < nbl->nsci; s++)
{
i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
work->sort[i]++;
}
/* Calculate the offset for each count */
- s0 = work->sort[m];
+ s0 = work->sort[m];
work->sort[m] = 0;
- for(i = m - 1; i >= 0; i--)
+ for (i = m - 1; i >= 0; i--)
{
s1 = work->sort[i];
work->sort[i] = work->sort[i + 1] + s0;
/* Sort entries directly into place */
sci_sort = work->sci_sort;
- for(s = 0; s < nbl->nsci; s++)
+ for (s = 0; s < nbl->nsci; s++)
{
i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
sci_sort[work->sort[i]++] = nbl->sci[s];
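
The loops above are a counting sort on pair-list length, with the offsets accumulated from the largest count downwards so that the longest lists are placed first. A self-contained version of the same pattern, with illustrative names, is:

    #include <stdlib.h>

    /* Counting sort of n keys in 0..m, largest keys first,
     * using the same count-then-offset scheme as above. */
    static void count_sort_desc(const int *key, int n, int m, int *out)
    {
        int *count = calloc(m + 1, sizeof(*count));
        int  i, s, s0, s1;

        for (s = 0; s < n; s++)
        {
            count[key[s]]++;
        }
        /* Turn counts into start offsets, largest key first */
        s0       = count[m];
        count[m] = 0;
        for (i = m - 1; i >= 0; i--)
        {
            s1       = count[i];
            count[i] = count[i + 1] + s0;
            s0       = s1;
        }
        for (s = 0; s < n; s++)
        {
            out[count[key[s]]++] = key[s];
        }
        free(count);
    }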
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define GMX_MM128_HERE
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define GMX_MM256_HERE
-#else
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
-#endif
+#if GMX_NBNXN_SIMD_BITWIDTH != 256
+#error "unsupported SIMD width"
#endif
+
#include "gmx_simd_macros.h"
+/* Define a few macros for half-width SIMD */
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+/* Half-width SIMD real type */
+#define gmx_mm_hpr __m128
+/* Half-width SIMD operations */
+/* Load reals at half-width aligned pointer b into half-width SIMD register a */
+#define gmx_load_hpr(a, b) a = _mm_load_ps(b)
+#define gmx_set1_hpr _mm_set1_ps
+/* Load reals at half-width aligned pointer b into two halves of a */
+#define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
+/* Combine half-width SIMD registers b and c into full-width register a */
+#define gmx_2hpr_to_pr(a, b, c) a = _mm256_insertf128_ps(_mm256_castps128_ps256(b), c, 0x1)
+#else
+#error "Half-width SIMD macros are not yet defined"
+#endif
+
+
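A standalone sketch of what gmx_2hpr_to_pr does, written with raw AVX intrinsics and useful for checking the lane order, could be:

    #include <immintrin.h> /* AVX */

    /* Combine two 4-float registers into one 8-float register:
     * lo fills lanes 0-3, hi fills lanes 4-7. */
    static __m256 combine_halves(__m128 lo, __m128 hi)
    {
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
    }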
#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
#else
static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
{
- gmx_mm_hpr a_SSE;
+ gmx_mm_hpr a_S;
+ gmx_mm_pr a_a_S;
+
+ gmx_load_hpr(a_S, a);
- a_SSE = _mm_load_ps(a);
+ gmx_2hpr_to_pr(a_a_S, a_S, a_S);
- return gmx_2hpr_to_pr(a_SSE, a_SSE);
+ return a_a_S;
}
static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
{
- gmx_mm_hpr a0, a1;
+ gmx_mm_hpr a0_S, a1_S;
+ gmx_mm_pr a0_a1_S;
- a0 = _mm_set1_ps(a[0] + shift);
- a1 = _mm_set1_ps(a[1] + shift);
+ a0_S = gmx_set1_hpr(a[0] + shift);
+ a1_S = gmx_set1_hpr(a[1] + shift);
- return gmx_2hpr_to_pr(a1, a0);
+ gmx_2hpr_to_pr(a0_a1_S, a0_S, a1_S);
+
+ return a0_a1_S;
}
/* Copies PBC shifted i-cell packed atom coordinates to working array */
x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
}
+#ifndef GMX_HAVE_SIMD_ANYTRUE
+/* Fallback function in case gmx_anytrue_pr is not present */
+static gmx_inline gmx_bool
+gmx_anytrue_2xn_pr(gmx_mm_pr bool_S)
+{
+ real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ gmx_bool any;
+ int s;
+
+ bools = gmx_simd_align_real(bools_array);
+
+ gmx_store_pr(bools, bool_S);
+
+ any = FALSE;
+ for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ {
+ if (GMX_SIMD_IS_TRUE(bools[s]))
+ {
+ any = TRUE;
+ }
+ }
+
+ return any;
+}
+#endif
+
/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
* Checks bounding box distances and possibly atom pair distances.
wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
- InRange = gmx_movemask_pr(wco_any_SSE);
+#ifdef GMX_HAVE_SIMD_ANYTRUE
+ InRange = gmx_anytrue_pr(wco_any_SSE);
+#else
+ InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
+#endif
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
- InRange = gmx_movemask_pr(wco_any_SSE);
+#ifdef GMX_HAVE_SIMD_ANYTRUE
+ InRange = gmx_anytrue_pr(wco_any_SSE);
+#else
+ InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
+#endif
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
{
/* Store cj and the interaction mask */
nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag, ci, cj);
+ nbl->cj[nbl->ncj].excl = get_imask_simd_2xnn(remove_sub_diag, ci, cj);
nbl->ncj++;
}
/* Increase the closing index in i super-cell list */
}
#undef STRIDE_S
-#undef GMX_MM128_HERE
-#undef GMX_MM256_HERE
+
+#undef gmx_mm_hpr
+#undef gmx_load_hpr
+#undef gmx_set1_hpr
+#undef gmx_2hpr_to_pr
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define GMX_MM128_HERE
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define GMX_MM256_HERE
-#else
+#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
+
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
#endif
#include "gmx_simd_macros.h"
x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
}
+#ifndef GMX_HAVE_SIMD_ANYTRUE
+/* Fallback function in case gmx_anytrue_pr is not present */
+static gmx_inline gmx_bool
+gmx_anytrue_4xn_pr(gmx_mm_pr bool_S)
+{
+ real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
+ gmx_bool any;
+ int s;
+
+ bools = gmx_simd_align_real(bools_array);
+
+ gmx_store_pr(bools, bool_S);
+
+ any = FALSE;
+ for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
+ {
+ if (GMX_SIMD_IS_TRUE(bools[s]))
+ {
+ any = TRUE;
+ }
+ }
+
+ return any;
+}
+#endif
+
/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
* for coordinates in packed format.
* Checks bounding box distances and possibly atom pair distances.
wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
- InRange = gmx_movemask_pr(wco_any_SSE);
+#ifdef GMX_HAVE_SIMD_ANYTRUE
+ InRange = gmx_anytrue_pr(wco_any_SSE);
+#else
+ InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
+#endif
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
- InRange = gmx_movemask_pr(wco_any_SSE);
+#ifdef GMX_HAVE_SIMD_ANYTRUE
+ InRange = gmx_anytrue_pr(wco_any_SSE);
+#else
+ InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
+#endif
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
{
/* Store cj and the interaction mask */
nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
- nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag, ci, cj);
+ nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
nbl->ncj++;
}
/* Increase the closing index in i super-cell list */
}
#undef STRIDE_S
-#undef GMX_MM128_HERE
-#undef GMX_MM256_HERE
+#undef GMX_USE_HALF_WIDTH_SIMD_HERE
* processed by the generic AdResS kernel.
*/
if ( (bEnergyGroupCG &&
- wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS ) ||
- ( !bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS ) )
+ wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS ) ||
+ ( !bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS ) )
{
continue;
}
b_hybrid = !((wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS) ||
- (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
+ (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
if (bNotEx)
{
MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
#endif
- int nthread; /* The number of threads doing PME */
+ gmx_bool bUseThreads; /* Does any PME rank have nthread > 1? */
+ int nthread; /* The number of threads doing PME on our rank */
gmx_bool bPPnode; /* Node also does particle-particle forces */
gmx_bool bFEP; /* Compute Free energy contribution */
static void pmegrids_init(pmegrids_t *grids,
int nx, int ny, int nz, int nz_base,
int pme_order,
+ gmx_bool bUseThreads,
int nthread,
int overlap_x,
int overlap_y)
make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
- if (grids->nthread > 1)
+ if (bUseThreads)
{
ivec nst;
int gridsize;
}
}
}
+ else
+ {
+ grids->grid_th = NULL;
+ }
snew(grids->g2t, DIM);
tfac = 1;
{
gmx_pme_t pme = NULL;
- pme_atomcomm_t *atc;
+ int use_threads, sum_use_threads;
ivec ndata;
if (debug)
pme->nthread = nthread;
+ /* Check if any of the PME MPI ranks uses threads */
+ use_threads = (pme->nthread > 1 ? 1 : 0);
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+ MPI_SUM, pme->mpi_comm);
+ }
+ else
+#endif
+ {
+ sum_use_threads = use_threads;
+ }
+ pme->bUseThreads = (sum_use_threads > 0);
+
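The reduction above lets every rank discover whether any rank runs multi-threaded, which matters because the threaded grid communication later on must be entered collectively by all ranks. A self-contained sketch of the same pattern (hypothetical function name, MPI_COMM_WORLD in place of the PME communicator) is:

    #include <mpi.h>

    /* Returns 1 on every rank if any rank passed local_flag != 0. */
    static int any_rank_flag(int local_flag)
    {
        int sum;

        MPI_Allreduce(&local_flag, &sum, 1, MPI_INT, MPI_SUM,
                      MPI_COMM_WORLD);

        return (sum > 0);
    }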
if (ir->ePBC == epbcSCREW)
{
gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
/* Check for a limitation of the (current) sum_fftgrid_dd code.
* We only allow multiple communication pulses in dim 1, not in dim 0.
*/
- if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
+ if (pme->bUseThreads && (pme->overlap[0].noverlap_nodes > 1 ||
pme->nkx < pme->nnodes_major*pme->pme_order))
{
gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
pme->pmegrid_nz_base,
pme->pme_order,
+ pme->bUseThreads,
pme->nthread,
pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
pme->pmegrid_nz_base,
pme->pme_order,
+ pme->bUseThreads,
pme->nthread,
pme->nkx % pme->nnodes_major != 0,
pme->nky % pme->nnodes_minor != 0);
sfree_aligned(new->grid.grid);
new->grid.grid = old->grid.grid;
- if (new->nthread > 1 && new->nthread == old->nthread)
+ if (new->grid_th != NULL && new->nthread == old->nthread)
{
sfree_aligned(new->grid_all);
for (t = 0; t < new->nthread; t++)
for (thread = 0; thread < nthread; thread++)
{
splinedata_t *spline;
- pmegrid_t *grid;
+ pmegrid_t *grid = NULL;
/* make local bsplines */
- if (grids == NULL || grids->nthread == 1)
+ if (grids == NULL || !pme->bUseThreads)
{
spline = &atc->spline[0];
spline->n = atc->n;
- grid = &grids->grid;
+ if (bSpread)
+ {
+ grid = &grids->grid;
+ }
}
else
{
spline = &atc->spline[thread];
- make_thread_local_ind(atc, thread, spline);
+ if (grids->nthread == 1)
+ {
+ /* One thread, we operate on all charges */
+ spline->n = atc->n;
+ }
+ else
+ {
+ /* Get the indices our thread should operate on */
+ make_thread_local_ind(atc, thread, spline);
+ }
grid = &grids->grid_th[thread];
}
#endif
spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
- if (grids->nthread > 1)
+ if (pme->bUseThreads)
{
copy_local_grid(pme, grids, thread, fftgrid);
}
cs2 += (double)c2;
#endif
- if (bSpread && grids->nthread > 1)
+ if (bSpread && pme->bUseThreads)
{
#ifdef PME_TIME_THREADS
c3 = omp_cyc_start();
if (pme->nnodes > 1)
{
- /* Communicate the overlapping part of the fftgrid */
+ /* Communicate the overlapping part of the fftgrid.
+ * For this communication call we need to branch on pme->bUseThreads
+ * so that all ranks participate, regardless of their local pme->nthread.
+ */
sum_fftgrid_dd(pme, fftgrid);
}
}
/* We only use the A-charges grid */
grid = &pme->pmegridA;
+ /* Only calculate the spline coefficients, don't actually spread */
spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
*V = gather_energy_bsplines(pme, grid->grid.grid, atc);
inc_nrnb(nrnb, eNR_SPREADQBSP,
pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
- if (pme->nthread == 1)
+ if (!pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, grid);
foreach(PROG ${NGMX_PROGRAMS})
add_executable(${PROG} ${PROG}.c ${NGMX_COMMON_SOURCE})
gmx_add_man_page(${PROG})
- target_link_libraries(${PROG} gmx ${GMX_EXTRA_LIBRARIES} ${X11_LIBRARIES})
+ target_link_libraries(${PROG} gmx ${X11_LIBRARIES})
set_target_properties(${PROG} PROPERTIES OUTPUT_NAME "${PROG}${GMX_BINARY_SUFFIX}")
endforeach(PROG)
)
-target_link_libraries(gmxana md gmx ${GSL_LIBRARIES})
+target_link_libraries(gmxana md ${GSL_LIBRARIES})
set_target_properties(gmxana PROPERTIES OUTPUT_NAME "gmxana${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}"
COMPILE_FLAGS "${OpenMP_C_FLAGS}")
endif()
endif()
add_executable(${TOOL} ${TOOL}.c)
- target_link_libraries(${TOOL} gmxana ${OpenMP_LINKER_FLAGS})
+ target_link_libraries(${TOOL} gmxana md gmx ${OpenMP_LINKER_FLAGS})
set_target_properties(${TOOL} PROPERTIES OUTPUT_NAME "${TOOL}${GMX_BINARY_SUFFIX}")
endforeach()
}
if (iMu[XX] < 0 || iMu[YY] < 0 || iMu[ZZ] < 0)
{
- gmx_fatal(FARGS,"No index for Mu-X, Mu-Y or Mu-Z energy group.");
+ gmx_fatal(FARGS, "No index for Mu-X, Mu-Y or Mu-Z energy group.");
}
}
else
real sqr_box, r2min, r2max, r2;
rvec shift[NSHIFT], d0, d;
- sqr_box = sqr(min(box[XX][XX], min(box[YY][YY], box[ZZ][ZZ])));
+ sqr_box = sqr(min(norm(box[XX]), min(norm(box[YY]), norm(box[ZZ]))));
s = 0;
for (sz = -1; sz <= 1; sz++)
size = 0, /* nr. of atoms in group. same as nr_tails */
i, j, m, k, l, teller = 0,
slice, /* current slice number */
- nr_frames = 0,
- *slCount; /* nr. of atoms in one slice */
+ nr_frames = 0;
+ int *slCount; /* nr. of atoms in one slice */
real dbangle = 0, /* angle between double bond and axis */
sdbangle = 0; /* sum of these angles */
gmx_bool use_unitvector = FALSE; /* use a specified unit vector instead of axis to specify unit normal */
svmul(1.0/distsize, dref, dref);
if (radial)
{
- pbc_dx(&pbc, dref, com, dvec);
+ pbc_dx(&pbc, dref, com, dvec);
unitv(dvec, dvec);
}
}
if (radial)
{
/* bin order parameter by arc distance from reference group */
- arcdist = gmx_angle(dvec,direction);
+ arcdist = gmx_angle(dvec, direction);
(*distvals)[j][i] += arcdist;
}
else if (i == 1)
{
/* Want minimum lateral distance to first group calculated */
tmpdist = trace(box); /* should be max value */
- for (k=0;k<distsize;k++)
+ for (k = 0; k < distsize; k++)
{
pbc_dx(&pbc, x1[distidx[k]], x1[a[index[i]+j]], dvec);
/* at the moment, just remove dvec[axis] */
dvec[axis] = 0;
- tmpdist = min(tmpdist, norm2(dvec));
+ tmpdist = min(tmpdist, norm2(dvec));
}
- //fprintf(stderr, "Min dist %f; trace %f\n", tmpdist, trace(box));
- (*distvals)[j][i]+=sqrt(tmpdist);
+ //fprintf(stderr, "Min dist %f; trace %f\n", tmpdist, trace(box));
+ (*distvals)[j][i] += sqrt(tmpdist);
}
}
} /* end loop j, over all atoms in group */
/* Returns TRUE when "opt" is needed at launch time */
static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
{
- /* Apart from the input .tpr and the output log files we need all options that
+ /* Apart from the input .tpr and the output log files we need all options that
* were set on the command line and that do not start with -b */
- if (0 == strncmp(opt, "-b" , 2) || 0 == strncmp(opt, "-s", 2)
- || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
+ if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
+ || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
{
return FALSE;
}
"REGRESSIONTEST_DOWNLOAD not supported with cmake ${CMAKE_VERSION}" FORCE)
endif()
if(REGRESSIONTEST_DOWNLOAD)
- if("${PROJECT_VERSION}" MATCHES "-dev")
- set(REGRESSIONTEST_VERSION master)
- else()
- set(REGRESSIONTEST_VERSION ${PROJECT_VERSION})
+ if(NOT REGRESSIONTEST_VERSION)
+ message(FATAL_ERROR "The configuration files do not specify what regressiontests tarball is suitable for automated download and testing. Please obtain and use a suitable set of tests yourself.")
endif()
set(REGRESSIONTEST_URL
http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
# Windows requires the command to be perl, not the script
COMMAND perl "${REGRESSIONTEST_PATH}/gmxtest.pl" ${subtest} ${ARGS})
set_tests_properties(regressiontests/${subtest} PROPERTIES
- ENVIRONMENT "PATH=${PATH}")
+ ENVIRONMENT "PATH=${PATH};GMXLIB=${CMAKE_SOURCE_DIR}/share/top")
endforeach()
endif()