pages.
The x86 assembly loops are now a single configure option,
and single/double precision is controlled with --enable-float
(default is yes), to be consistent with FFTW.
Removed the less common options from the summary printed by
configure, but they are still available.
Introduced libtool to create both static and dynamic libraries -
you can control it with configure options. --disable-shared might
be suitable for development work.
To avoid compiling both PIC and non-PIC code you can try --with-pic,
but the default is both.
obj
Makefile.in
include
+config.log
+config.cache
+libtool
#
# Most files in the config subdir is included automatically when
-# we issue "make dist", but not depcomp. This just includes that file...
-#
-EXTRA_DIST = config/depcomp
+# we issue "make dist", but some versions of automake seem to have
+# problems with it, so we include them all...
+EXTRA_DIST = config/depcomp config/ltconfig config/ltcf-c.sh \
+ config/ltcf-f77.sh config/config.guess config/config.sub \
+ config/install-sh config/missing config/ltmain.sh \
+ config/mkinstalldirs
#
# This is a shortcut to construct the mdrun executable by first
install-mdrun:
(cd ${top_builddir}/src/kernel && $(MAKE) install-mdrun)
+links:
+ (cd /usr/local/bin && $(LN_S) ${bindir}/* .)
+
#
# Apart from normal things like .o, things matching this are removed
# (The second one removes files beginning with a #)
-CLEANFILES = *~ \\\#*
+CLEANFILES = *~ \\\#*
+DISTCLEANFILES = libtool config.cache config.log
+
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
AC_TRY_COMPILE([#include <$fftwname.h>],
[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)]; ],
ok="yes",ok="no")
[
AC_MSG_RESULT(no)
AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org]
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.]
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself: ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.]
[If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.])
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])
])
AC_TRY_COMPILE([#include <$xfftwname.h>],[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)];],
[
fftwname=$xfftwname
usedprefix=$fftwcheckprefix
],
+[
AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org]
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.]
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself: ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.]
[If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.]))
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])])
fi
AC_CHECK_LIB($fftwname,main,,AC_MSG_ERROR([Can't find a library to match the $fftwname header]))
fi
AC_ARG_WITH(motif-includes,
-[ --with-motif-includes=DIR Motif include files are in DIR],
+[ --with-motif-includes=DIR Motif include files are in DIR],
motif_includes="$withval")
AC_ARG_WITH(motif-libraries,
-[ --with-motif-libraries=DIR Motif libraries are in DIR],
+[ --with-motif-libraries=DIR Motif libraries are in DIR],
motif_libraries="$withval")
AC_MSG_CHECKING(for Motif)
# determine our suggested choices for both C and fortran, and then possibly
# override them with user choices.
+cc_vendor="unknown"
+
case "${host_cpu}-${host_os}" in
*-solaris2*)
esac
if $CC -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$tmpCFLAGS"
+ cc_vendor="Compaq"
fi
if test "$enable_fortran" = "yes"; then
if $F77 -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
fi
if test "$enable_fortran" = "yes"; then
- if $F77 -V 2> /dev/null | grep Portland /dev/null 2>&1; then
+ if $F77 -V 2> /dev/null | grep Portland > /dev/null 2>&1; then
xFFLAGS="$xCFLAGS -Mneginfo=loop"
fi
fi
fi
CPU_FLAGS=""
+
if test "$GCC" = "yes"; then
+ AM_CONDITIONAL(GNU_CC,true)
# try to guess correct CPU flags, at least for linux
case "${host_cpu}" in
# i586/i686 cpu flags don't improve speed, thus no need to use them.
ACX_CHECK_CC_FLAGS(-mpowerpc,m_powerpc,CPU_FLAGS=-mpowerpc)
fi
esac
+else
+ AM_CONDITIONAL(GNU_CC,false)
fi
if test -n "$CPU_FLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the C *"
echo "* compiler. Use make CFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
CFLAGS="-O3"
fi
if test "$enable_fortran" = "yes"; then
if test "$ac_test_FFLAGS" != "set"; then
FFLAGS="$xFFLAGS"
-
if test -z "$FFLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the fortran *"
echo "* compiler. Use make FFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
FFLAGS="-O3"
fi
echo "******************************************"
fi
fi
+
+])
+
+
+
+
+
+
+## libtool.m4 - Configure libtool for the host system. -*-Shell-script-*-
+## Copyright 1996, 1997, 1998, 1999, 2000, 2001
+## Free Software Foundation, Inc.
+## Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+##
+## As a special exception to the GNU General Public License, if you
+## distribute this file as part of a program that contains a
+## configuration script generated by Autoconf, you may include it under
+## the same distribution terms that you use for the rest of that program.
+
+# serial 46 AC_PROG_LIBTOOL
+# AC_PROG_LIBTOOL - main entry point for configuring libtool.
+# Requires _AC_PROG_LIBTOOL, then hooks in the C++, Fortran 77 and GCJ
+# libtool setup macros: each one is run immediately if the matching
+# compiler macro has already been expanded, otherwise the compiler macro
+# is redefined so that the libtool setup is appended to its expansion.
+AC_DEFUN([AC_PROG_LIBTOOL],
+[AC_REQUIRE([_AC_PROG_LIBTOOL])dnl
+dnl If AC_PROG_CXX has already been expanded, run AC_LIBTOOL_CXX
+dnl immediately, otherwise, hook it in at the end of AC_PROG_CXX.
+ AC_PROVIDE_IFELSE([AC_PROG_CXX],
+ [AC_LIBTOOL_CXX],
+ [define([AC_PROG_CXX], defn([AC_PROG_CXX])[AC_LIBTOOL_CXX
+])])
+ AC_PROVIDE_IFELSE([AC_PROG_F77],
+ [AC_LIBTOOL_F77],
+ [define([AC_PROG_F77], defn([AC_PROG_F77])[AC_LIBTOOL_F77
+])])
+
+dnl Quote A][M_PROG_GCJ so that aclocal doesn't bring it in needlessly.
+dnl If either AC_PROG_GCJ or A][M_PROG_GCJ have already been expanded, run
+dnl AC_LIBTOOL_GCJ immediately, otherwise, hook it in at the end of both.
+ AC_PROVIDE_IFELSE([AC_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [ifdef([AC_PROG_GCJ],
+ [define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+ ifdef([A][M_PROG_GCJ],
+ [define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+ ifdef([LT_AC_PROG_GCJ],
+ [define([LT_AC_PROG_GCJ], defn([LT_AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])])])])])
+
+# _AC_PROG_LIBTOOL - run ltconfig to configure libtool.
+# Saves the configure cache, runs $ac_aux_dir/ltconfig with the tool
+# settings found so far passed in through the environment, aborts
+# configure if that fails, reloads the cache, and sets LIBTOOL_DEPS and
+# LIBTOOL (the latter is AC_SUBSTed).
+AC_DEFUN([_AC_PROG_LIBTOOL],
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_CXX])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_GCJ])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload cache, that may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN([AC_LIBTOOL_SETUP],
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only work for C compilers!
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+ if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+ AC_PATH_MAGIC
+ fi
+ ;;
+esac
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+ [ --disable-libtool-lock avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+ [ --with-pic try to use only PIC/non-PIC [default=both]],
+ pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+ if AC_TRY_EVAL(ac_compile); then
+ case `/usr/bin/file conftest.$ac_objext` in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+ [AC_LANG_SAVE
+ AC_LANG_C
+ AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+ AC_LANG_RESTORE])
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+ AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+ AC_CHECK_TOOL(AS, as, false)
+ AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one
+ AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+ [AC_TRY_LINK([],
+ [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+ DllMain (0, 0, 0);],
+ [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+ case $host/$CC in
+ *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+ # old mingw systems require "-dll" to link a DLL, while more recent ones
+ # require "-mdll"
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -mdll"
+ AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+ [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+ CFLAGS="$SAVE_CFLAGS" ;;
+ *-*-cygwin* | *-*-pw32*)
+ # cygwin systems need to pass --dll to the linker, and not link
+ # crt.o which will require a WinMain@16 definition.
+ lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+ esac
+ ;;
+ ])
+esac
+])
+
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+# The body only declares that this macro must come before
+# AC_LIBTOOL_SETUP; AC_LIBTOOL_SETUP tests whether this macro has been
+# provided and, if so, adds --enable-dlopen to the ltconfig flags.
+AC_DEFUN([AC_LIBTOOL_DLOPEN], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+# The body only declares that this macro must come before
+# AC_LIBTOOL_SETUP; AC_LIBTOOL_SETUP tests whether this macro has been
+# provided and, if so, adds --enable-win32-dll to the ltconfig flags and
+# enables its cygwin/mingw/pw32 host checks.
+AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# The argument given to the flag may be `yes', `no', or a list of package
+# names separated by whitespace, `:' or `,'. In the list form
+# enable_shared becomes yes only if $PACKAGE (or the word `default' when
+# PACKAGE is unset) appears in the list.
+AC_DEFUN([AC_ENABLE_SHARED],
+[define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<< --enable-shared[=PKGS] build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+# Equivalent to AC_ENABLE_SHARED(no); must come before AC_LIBTOOL_SETUP.
+AC_DEFUN([AC_DISABLE_SHARED], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# The argument given to the flag may be `yes', `no', or a list of package
+# names separated by whitespace, `:' or `,'. In the list form
+# enable_static becomes yes only if $PACKAGE (or the word `default' when
+# PACKAGE is unset) appears in the list.
+AC_DEFUN([AC_ENABLE_STATIC],
+[define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<< --enable-static[=PKGS] build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+# Equivalent to AC_ENABLE_STATIC(no); must come before AC_LIBTOOL_SETUP.
+AC_DEFUN([AC_DISABLE_STATIC],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# The argument given to the flag may be `yes', `no', or a list of package
+# names separated by whitespace, `:' or `,'. In the list form
+# enable_fast_install becomes yes only if $PACKAGE (or the word `default'
+# when PACKAGE is unset) appears in the list.
+AC_DEFUN([AC_ENABLE_FAST_INSTALL],
+[define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<< --enable-fast-install[=PKGS] optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_DISABLE_FAST_INSTALL - set the default to --disable-fast-install
+# Equivalent to AC_ENABLE_FAST_INSTALL(no); must come before
+# AC_LIBTOOL_SETUP.
+AC_DEFUN([AC_DISABLE_FAST_INSTALL],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+# AC_LIBTOOL_PICMODE - implement the --with-pic flag
+# Usage: AC_LIBTOOL_PICMODE[(MODE)]
+# Where MODE is either `yes' or `no'. If omitted, pic_mode is set to
+# `default', for which no PIC preference flag is passed to ltconfig.
+AC_DEFUN([AC_LIBTOOL_PICMODE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+pic_mode=ifelse($#,1,$1,default)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_TOOL_PREFIX],
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+ /*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC_CMD="$MAGIC_CMD"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="ifelse([$2], , $PATH, [$2])"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$1; then
+ lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+ if test -n "$file_magic_test_file"; then
+ case $deplibs_check_method in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC_CMD="$ac_save_MAGIC_CMD"
+ ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+ AC_MSG_RESULT($MAGIC_CMD)
+else
+ AC_MSG_RESULT(no)
+fi
])
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+# Looks for ${ac_tool_prefix}file in /usr/bin and then $PATH. If nothing
+# was found and a tool prefix is set, the search is repeated for plain
+# `file'; with no tool prefix MAGIC_CMD is set to `:' instead.
+AC_DEFUN([AC_PATH_MAGIC],
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+ if test -n "$ac_tool_prefix"; then
+ AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+ else
+ MAGIC_CMD=:
+ fi
+fi
+])
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+AC_DEFUN([AC_PROG_LD],
+[AC_ARG_WITH(gnu-ld,
+[ --with-gnu-ld assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$GCC" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ AC_MSG_CHECKING([for ld used by GCC])
+ case $host in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case $ac_prog in
+ # Accept absolute paths.
+ [[\\/]* | [A-Za-z]:[\\/]*)]
+ re_direlt=['/[^/][^/]*/\.\./']
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ AC_MSG_CHECKING([for GNU ld])
+else
+ AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ lt_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+ AC_MSG_RESULT($LD)
+else
+ AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+# AC_PROG_LD_GNU - check whether the linker in $LD is GNU ld
+# Runs `$LD -v' and looks for `GNU' or `with BFD' in its output. The
+# result is cached in lt_cv_prog_gnu_ld and copied to with_gnu_ld.
+AC_DEFUN([AC_PROG_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ lt_cv_prog_gnu_ld=yes
+else
+ lt_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+# -- PORTME Some linkers may need a different reload flag.
+# The cached value lt_cv_ld_reload_flag is currently always set to `-r'.
+# reload_flag is that value with a leading space prepended when it is
+# non-empty.
+AC_DEFUN([AC_PROG_LD_RELOAD_FLAG],
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+# -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN([AC_DEPLIBS_CHECK_METHOD],
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)']
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw* |pw32*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='$OBJDUMP -f'
+ ;;
+
+darwin* | rhapsody*)
+ lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ case "$host_os" in
+ rhapsody* | darwin1.[012])
+ lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+ ;;
+ *) # Darwin 1.3 on
+ lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+ ;;
+ esac
+ ;;
+
+freebsd* )
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ case $host_cpu in
+ i*86 )
+ # Not sure whether the presence of OpenBSD here was a mistake.
+ # Let's accept both of them until this is cleared up.
+ lt_cv_deplibs_check_method=['file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+ ;;
+ esac
+ else
+ lt_cv_deplibs_check_method=pass_all
+ fi
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+hpux10.20*|hpux11*)
+ lt_cv_deplibs_check_method=['file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libc.sl
+ ;;
+
+irix5* | irix6*)
+ case $host_os in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case $LD in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method=["file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"]
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case $host_cpu in
+ alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'] ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$']
+ else
+ [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$']
+ fi
+ ;;
+
+newsos6)
+ [lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libnls.so
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+[sysv5uw[78]* | sysv4*uw2*)]
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case $host_vendor in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]']
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+# A preset $NM is used as-is. Otherwise ${ac_tool_prefix}nm is looked up
+# in $PATH followed by /usr/ccs/bin, /usr/ucb and /bin; the search stops
+# at the first candidate that accepts -B or, failing that, -p. If none
+# does, the first nm that was found is used, and plain `nm' if none was
+# found at all. The result is cached in lt_cv_path_NM and stored in NM.
+AC_DEFUN([AC_PROG_NM],
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(lt_cv_path_NM,
+[if test -n "$NM"; then
+ # Let the user override the test.
+ lt_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ tmp_nm=$ac_dir/${ac_tool_prefix}nm
+ if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ # Tru64's nm complains that /dev/null is an invalid object file
+ if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -B"
+ break
+ elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -p"
+ break
+ else
+ lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi])
+NM="$lt_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+# Sets LIBM to the link flags for the math library: left empty on BeOS,
+# Cygwin and PW32; `-lmw' and/or `-lm' on NCR SysV4.3 depending on which
+# of the two libraries are found; `-lm' elsewhere if libm is found.
+AC_DEFUN([AC_CHECK_LIBM],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cygwin* | *-*-pw32*)
+ # These systems don't have libm
+ ;;
+*-ncr-sysv4.3*)
+ AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+ AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+ ;;
+*)
+ AC_CHECK_LIB(m, main, LIBM="-lm")
+ ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-convenience to the
+# configure arguments. Note that LIBLTDL and INCLTDL are not
+# AC_SUBSTed, nor is AC_CONFIG_SUBDIRS called. If DIR is not
+# provided, it is assumed to be `libltdl'. LIBLTDL will be prefixed
+# with '${top_builddir}/' and INCLTDL will be prefixed with
+# '${top_srcdir}/' (note the single quotes!). If your package is not
+# flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# configure is aborted with an error if enable_ltdl_convenience is
+# already set to `no'; the configure argument is only added when the
+# variable is empty.
+AC_DEFUN([AC_LIBLTDL_CONVENIENCE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ case $enable_ltdl_convenience in
+ no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+ "") enable_ltdl_convenience=yes
+ ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+ esac
+ LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdlc.la
+ INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-install to the configure
+# arguments. Note that LIBLTDL and INCLTDL are not AC_SUBSTed, nor is
+# AC_CONFIG_SUBDIRS called. If DIR is not provided and an installed
+# libltdl is not found, it is assumed to be `libltdl'. LIBLTDL will
+# be prefixed with '${top_builddir}/' and INCLTDL will be prefixed
+# with '${top_srcdir}/' (note the single quotes!). If your package is
+# not flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+# When an installed libltdl is used instead, LIBLTDL is "-lltdl", INCLTDL
+# is empty and --enable-ltdl-install=no is added to the configure arguments.
+AC_DEFUN([AC_LIBLTDL_INSTALLABLE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+dnl Library found: keep an explicit yes from the user, otherwise do not
+dnl install our own copy. Library missing: install our own copy unless the
+dnl user explicitly disabled that, in which case only warn.
+ AC_CHECK_LIB(ltdl, main,
+ [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+ [if test x"$enable_ltdl_install" = xno; then
+ AC_MSG_WARN([libltdl not installed, but installation disabled])
+ else
+ enable_ltdl_install=yes
+ fi
+ ])
+dnl Point LIBLTDL and INCLTDL either at the bundled copy or at the
+dnl installed library, and record the decision in ac_configure_args.
+ if test x"$enable_ltdl_install" = x"yes"; then
+ ac_configure_args="$ac_configure_args --enable-ltdl-install"
+ LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdl.la
+ INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+ else
+ ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+ LIBLTDL="-lltdl"
+ INCLTDL=
+ fi
+])
+
+# If this macro is not defined by Autoconf, define it here.
+# The fallback takes three arguments: a macro name, the text to use when
+# the matching AC_PROVIDE_ marker for that name is defined, and the text
+# to use otherwise.
+ifdef([AC_PROVIDE_IFELSE],
+ [],
+ [define([AC_PROVIDE_IFELSE],
+ [ifdef([AC_PROVIDE_$1],
+ [$2], [$3])])])
+
+# AC_LIBTOOL_F77 - enable support for fortran libraries
+# The public macro only AC_REQUIREs the worker macro _AC_LIBTOOL_F77 below.
+AC_DEFUN([AC_LIBTOOL_F77], [AC_REQUIRE([_AC_LIBTOOL_F77])])
+
+# _AC_LIBTOOL_F77 - run ltconfig with --add-tag=F77 and ltcf-f77.sh,
+# writing to the file libtool. ltconfig is invoked with CC set to $F77,
+# CFLAGS set to $FFLAGS and an empty CPPFLAGS; configure aborts if
+# ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_F77],
+[AC_REQUIRE([AC_PROG_F77])
+dnl Add the tag script to the list of files libtool depends on.
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the fortran compiler.
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+dnl Put back the C compiler settings saved above.
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+# AC_LIBTOOL_CXX - enable support for C++ libraries
+# The public macro only AC_REQUIREs the worker macro _AC_LIBTOOL_CXX below.
+AC_DEFUN([AC_LIBTOOL_CXX], [AC_REQUIRE([_AC_LIBTOOL_CXX])])
+
+# _AC_LIBTOOL_CXX - run ltconfig with --add-tag=CXX and ltcf-cxx.sh,
+# writing to the file libtool. ltconfig is invoked with CC set to $CXX
+# and CFLAGS set to $CXXFLAGS; configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_CXX],
+[AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+dnl Add the tag script to the list of files libtool depends on.
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-cxx.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the C++ compiler.
+AR="$AR" LTCC="$CC" CC="$CXX" CXX="$CXX" CFLAGS="$CXXFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=CXX $ac_aux_dir/ltcf-cxx.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+dnl Put back the C compiler settings saved above.
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+# AC_LIBTOOL_GCJ - enable support for GCJ libraries
+# The public macro only AC_REQUIREs the worker macro _AC_LIBTOOL_GCJ below.
+AC_DEFUN([AC_LIBTOOL_GCJ],[AC_REQUIRE([_AC_LIBTOOL_GCJ])])
+
+# _AC_LIBTOOL_GCJ - run ltconfig with --add-tag=GCJ and ltcf-gcj.sh,
+# writing to the file libtool. ltconfig is invoked with CC set to $GCJ
+# and CFLAGS set to $GCJFLAGS; configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_GCJ],
+[AC_REQUIRE([AC_PROG_LIBTOOL])
+dnl If none of the known GCJ detection macros has been provided yet,
+dnl require the first one that is defined; the macro names are split
+dnl with quotes so that aclocal does not pull them in.
+AC_PROVIDE_IFELSE([AC_PROG_GCJ],[],
+ [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],[],
+ [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],[],
+ [ifdef([AC_PROG_GCJ],[AC_REQUIRE([AC_PROG_GCJ])],
+ [ifdef([A][M_PROG_GCJ],[AC_REQUIRE([A][M_PROG_GCJ])],
+ [AC_REQUIRE([A][C_PROG_GCJ_OR_A][M_PROG_GCJ])])])])])])
+dnl Add the tag script to the list of files libtool depends on.
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-gcj.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the GCJ compiler.
+AR="$AR" LTCC="$CC" CC="$GCJ" CFLAGS="$GCJFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=GCJ $ac_aux_dir/ltcf-gcj.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+dnl Put back the C compiler settings saved above.
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+dnl old names
+dnl Each alias below simply expands to the macro of the same name with the
+dnl AC prefix; the enable and disable aliases pass their arguments through.
+AC_DEFUN([AM_PROG_LIBTOOL], [AC_PROG_LIBTOOL])
+AC_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)])
+AC_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)])
+AC_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+AC_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+AC_DEFUN([AM_PROG_LD], [AC_PROG_LD])
+AC_DEFUN([AM_PROG_NM], [AC_PROG_NM])
+
+dnl This is just to silence aclocal about the macro not being used
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+ifelse([AC_DISABLE_SHARED])dnl
+
+# LT_AC_PROG_GCJ - look for the gcj tool via AC_CHECK_TOOL (with "no" as
+# the fallback value for GCJ), default GCJFLAGS to "-g -O2" unless the
+# variable is already set, and substitute GCJFLAGS.
+AC_DEFUN([LT_AC_PROG_GCJ],
+[AC_CHECK_TOOL(GCJ, gcj, no)
+ test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+ AC_SUBST(GCJFLAGS)
+])
-# aclocal.m4 generated automatically by aclocal 1.4d
+dnl aclocal.m4 generated automatically by aclocal 1.4
-# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000
-# Free Software Foundation, Inc.
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
+dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+dnl PARTICULAR PURPOSE.
# ACX_CHECK_FFTW()
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
AC_TRY_COMPILE([#include <$fftwname.h>],
[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)]; ],
ok="yes",ok="no")
[
AC_MSG_RESULT(no)
AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org]
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.]
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself: ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.]
[If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.])
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])
])
AC_TRY_COMPILE([#include <$xfftwname.h>],[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)];],
[
fftwname=$xfftwname
usedprefix=$fftwcheckprefix
],
+[
AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org]
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.]
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself: ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.]
[If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.]))
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])])
fi
AC_CHECK_LIB($fftwname,main,,AC_MSG_ERROR([Can't find a library to match the $fftwname header]))
fi
AC_ARG_WITH(motif-includes,
-[ --with-motif-includes=DIR Motif include files are in DIR],
+[ --with-motif-includes=DIR Motif include files are in DIR],
motif_includes="$withval")
AC_ARG_WITH(motif-libraries,
-[ --with-motif-libraries=DIR Motif libraries are in DIR],
+[ --with-motif-libraries=DIR Motif libraries are in DIR],
motif_libraries="$withval")
AC_MSG_CHECKING(for Motif)
# determine our suggested choices for both C and fortran, and then possibly
# override them with user choices.
+cc_vendor="unknown"
+
case "${host_cpu}-${host_os}" in
*-solaris2*)
esac
if $CC -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$tmpCFLAGS"
+ cc_vendor="Compaq"
fi
if test "$enable_fortran" = "yes"; then
if $F77 -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
fi
if test "$enable_fortran" = "yes"; then
- if $F77 -V 2> /dev/null | grep Portland /dev/null 2>&1; then
+ if $F77 -V 2> /dev/null | grep Portland > /dev/null 2>&1; then
xFFLAGS="$xCFLAGS -Mneginfo=loop"
fi
fi
fi
CPU_FLAGS=""
+
if test "$GCC" = "yes"; then
+ AM_CONDITIONAL(GNU_CC,true)
# try to guess correct CPU flags, at least for linux
case "${host_cpu}" in
# i586/i686 cpu flags don't improve speed, thus no need to use them.
ACX_CHECK_CC_FLAGS(-mpowerpc,m_powerpc,CPU_FLAGS=-mpowerpc)
fi
esac
+else
+ AM_CONDITIONAL(GNU_CC,false)
fi
if test -n "$CPU_FLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the C *"
echo "* compiler. Use make CFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
CFLAGS="-O3"
fi
if test "$enable_fortran" = "yes"; then
if test "$ac_test_FFLAGS" != "set"; then
FFLAGS="$xFFLAGS"
-
if test -z "$FFLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the fortran *"
echo "* compiler. Use make FFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
FFLAGS="-O3"
fi
echo "******************************************"
fi
fi
+
])
-# Do all the work for Automake. This macro actually does too much --
-# some checks are only needed if your package does certain things.
-# But this isn't really a big deal.
+# serial 46 AC_PROG_LIBTOOL
+# AC_PROG_LIBTOOL - public entry point. Requires _AC_PROG_LIBTOOL for the
+# base configuration, then arranges for the C++, Fortran 77 and GCJ tag
+# macros to run: immediately when the matching compiler macro has already
+# been provided, otherwise by appending the tag macro to the definition of
+# that compiler macro.
+AC_DEFUN([AC_PROG_LIBTOOL],
+[AC_REQUIRE([_AC_PROG_LIBTOOL])dnl
+dnl If AC_PROG_CXX has already been expanded, run AC_LIBTOOL_CXX
+dnl immediately, otherwise, hook it in at the end of AC_PROG_CXX.
+ AC_PROVIDE_IFELSE([AC_PROG_CXX],
+ [AC_LIBTOOL_CXX],
+ [define([AC_PROG_CXX], defn([AC_PROG_CXX])[AC_LIBTOOL_CXX
+])])
+dnl Same treatment for AC_PROG_F77 and AC_LIBTOOL_F77.
+ AC_PROVIDE_IFELSE([AC_PROG_F77],
+ [AC_LIBTOOL_F77],
+ [define([AC_PROG_F77], defn([AC_PROG_F77])[AC_LIBTOOL_F77
+])])
+
+dnl Quote A][M_PROG_GCJ so that aclocal doesn't bring it in needlessly.
+dnl If either AC_PROG_GCJ or A][M_PROG_GCJ have already been expanded, run
+dnl AC_LIBTOOL_GCJ immediately, otherwise, hook it in at the end of both.
+ AC_PROVIDE_IFELSE([AC_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],
+ [AC_LIBTOOL_GCJ],
+ [ifdef([AC_PROG_GCJ],
+ [define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+ ifdef([A][M_PROG_GCJ],
+ [define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+ ifdef([LT_AC_PROG_GCJ],
+ [define([LT_AC_PROG_GCJ], defn([LT_AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])])])])])
+
+# _AC_PROG_LIBTOOL - base libtool configuration. After AC_LIBTOOL_SETUP has
+# run, saves the configure cache, runs ltconfig on ltmain.sh with the tool
+# and flag variables passed in its environment, reloads the cache, sets
+# LIBTOOL_DEPS and substitutes LIBTOOL. Configure aborts if ltconfig fails.
+AC_DEFUN([_AC_PROG_LIBTOOL],
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_CXX])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_GCJ])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload cache, that may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+dnl The single quotes keep the make variables unexpanded in the shell, so
+dnl the substituted value still contains them literally.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN([AC_LIBTOOL_SETUP],
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only work for C compilers!
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+ if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+ AC_PATH_MAGIC
+ fi
+ ;;
+esac
-# serial 5
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+ [ --disable-libtool-lock avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+ [ --with-pic try to use only PIC/non-PIC [default=both]],
+ pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+ if AC_TRY_EVAL(ac_compile); then
+ case `/usr/bin/file conftest.$ac_objext` in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
-# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
-# written in clear, in which case automake, when reading aclocal.m4,
-# will think it sees a *use*, and therefore will trigger all it's
-# C support machinery. Also note that it means that autoscan, seeing
-# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+ [AC_LANG_SAVE
+ AC_LANG_C
+ AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+ AC_LANG_RESTORE])
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+ AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+ AC_CHECK_TOOL(AS, as, false)
+ AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one
+ AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+ [AC_TRY_LINK([],
+ [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+ DllMain (0, 0, 0);],
+ [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+ case $host/$CC in
+ *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+ # old mingw systems require "-dll" to link a DLL, while more recent ones
+ # require "-mdll"
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -mdll"
+ AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+ [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+ CFLAGS="$SAVE_CFLAGS" ;;
+ *-*-cygwin* | *-*-pw32*)
+ # cygwin systems need to pass --dll to the linker, and not link
+ # crt.o which will require a WinMain@16 definition.
+ lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+ esac
+ ;;
+ ])
+esac
+])
-# We require 2.13 because we rely on SHELL being computed by configure.
-AC_PREREQ([2.13])
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+# The body only declares that this macro must come before
+# AC_LIBTOOL_SETUP, which adds --enable-dlopen to the ltconfig flags
+# when this macro has been provided.
+AC_DEFUN([AC_LIBTOOL_DLOPEN], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+# The body only declares that this macro must come before
+# AC_LIBTOOL_SETUP, which adds --enable-win32-dll to the ltconfig flags
+# and enables its cygwin/mingw/pw32 checks when this macro has been provided.
+AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# Sets the shell variable enable_shared to yes or no. With
+# --enable-shared=PKGS the list is split on the current IFS characters
+# plus colon and comma, and enable_shared becomes yes only when one entry
+# equals $PACKAGE (or the word "default" when PACKAGE is unset).
+AC_DEFUN([AC_ENABLE_SHARED],
+[define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<< --enable-shared[=PKGS] build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+# Must be used before AC_LIBTOOL_SETUP; expands to AC_ENABLE_SHARED(no).
+AC_DEFUN([AC_DISABLE_SHARED], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# Sets the shell variable enable_static to yes or no. With
+# --enable-static=PKGS the list is split on the current IFS characters
+# plus colon and comma, and enable_static becomes yes only when one entry
+# equals $PACKAGE (or the word "default" when PACKAGE is unset).
+AC_DEFUN([AC_ENABLE_STATIC],
+[define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<< --enable-static[=PKGS] build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+# Must be used before AC_LIBTOOL_SETUP; expands to AC_ENABLE_STATIC(no).
+AC_DEFUN([AC_DISABLE_STATIC],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+# Sets the shell variable enable_fast_install to yes or no. With
+# --enable-fast-install=PKGS the list is split on the current IFS
+# characters plus colon and comma, and enable_fast_install becomes yes only
+# when one entry equals $PACKAGE (or the word "default" when PACKAGE is
+# unset).
+AC_DEFUN([AC_ENABLE_FAST_INSTALL],
+[define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<< --enable-fast-install[=PKGS] optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_DISABLE_FAST_INSTALL - set the default to --disable-fast-install
+# Must be used before AC_LIBTOOL_SETUP; expands to
+# AC_ENABLE_FAST_INSTALL(no).
+AC_DEFUN([AC_DISABLE_FAST_INSTALL],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+# AC_LIBTOOL_PICMODE - implement the --with-pic flag
+# Usage: AC_LIBTOOL_PICMODE[(MODE)]
+# Where MODE is either `yes' or `no'. If omitted, it defaults to
+# `both'.
+# The shell variable pic_mode is set to MODE when exactly one argument is
+# given; otherwise it is set to the word "default" (the help text of
+# --with-pic describes that case as "both").
+# NOTE(review): AC_LIBTOOL_SETUP assigns pic_mode again in its --with-pic
+# handling (to the option value, or to "default" when the option is
+# absent), which looks like it overrides the value chosen here - confirm.
+AC_DEFUN([AC_LIBTOOL_PICMODE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+pic_mode=ifelse($#,1,$1,default)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise shared library
+# Usage: AC_PATH_TOOL_PREFIX(NAME, [SEARCH-PATH])
+# Searches the colon-separated SEARCH-PATH ($PATH when omitted) for a
+# regular file called NAME and stores the first hit in MAGIC_CMD; the
+# result is cached in lt_cv_path_MAGIC_CMD. A MAGIC_CMD that already holds
+# an absolute or DOS-style path is used as is. When file_magic_test_file is
+# set and the check method is file_magic, the candidate is run on that file
+# and a warning is printed to stderr if the output does not match the
+# expected regex; the candidate is still accepted.
+AC_DEFUN([AC_PATH_TOOL_PREFIX],
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+ /*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC_CMD="$MAGIC_CMD"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="ifelse([$2], , $PATH, [$2])"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$1; then
+ lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+ if test -n "$file_magic_test_file"; then
+ case $deplibs_check_method in
+ "file_magic "*)
+dnl Take the regex that follows the file_magic keyword, then run the
+dnl candidate on the test file with MAGIC_CMD temporarily pointing at it.
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC_CMD="$ac_save_MAGIC_CMD"
+ ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+ AC_MSG_RESULT($MAGIC_CMD)
+else
+ AC_MSG_RESULT(no)
+fi
+])
+
+
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+# Looks in /usr/bin and then $PATH for ${ac_tool_prefix}file. If that finds
+# nothing and a tool prefix is set, the search is repeated for plain file;
+# if it finds nothing and no tool prefix is set, MAGIC_CMD is set to ":".
+AC_DEFUN([AC_PATH_MAGIC],
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+ if test -n "$ac_tool_prefix"; then
+ AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+ else
+ MAGIC_CMD=:
+ fi
+fi
+])
+
+
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+# Honours --with-gnu-ld. When GCC is in use, the compiler is asked for its
+# linker with -print-prog-name=ld; an absolute answer is canonicalized and
+# used for LD unless LD is already set. Otherwise, when LD is empty, PATH
+# is searched for the linker, preferring a GNU or non-GNU one according to
+# with_gnu_ld. The result is cached in lt_cv_path_LD and stored in LD;
+# configure aborts when no linker is found. AC_PROG_LD_GNU runs last.
+AC_DEFUN([AC_PROG_LD],
+[AC_ARG_WITH(gnu-ld,
+[ --with-gnu-ld assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$GCC" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ AC_MSG_CHECKING([for ld used by GCC])
+ case $host in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case $ac_prog in
+ # Accept absolute paths.
+ [[\\/]* | [A-Za-z]:[\\/]*)]
+ re_direlt=['/[^/][^/]*/\.\./']
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ AC_MSG_CHECKING([for GNU ld])
+else
+ AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ lt_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+ AC_MSG_RESULT($LD)
+else
+ AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+
+# AC_PROG_LD_GNU - check whether $LD is GNU ld by running it with -v and
+# looking for "GNU" or "with BFD" in the output. The answer is cached in
+# lt_cv_prog_gnu_ld and copied to with_gnu_ld.
+AC_DEFUN([AC_PROG_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ lt_cv_prog_gnu_ld=yes
+else
+ lt_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])
+
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+# -- PORTME Some linkers may need a different reload flag.
+# The cached value lt_cv_ld_reload_flag is set to -r unconditionally;
+# reload_flag is that value with a leading space when it is non-empty.
+AC_DEFUN([AC_PROG_LD_RELOAD_FLAG],
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+# -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN([AC_DEPLIBS_CHECK_METHOD],
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)']
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw* |pw32*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='$OBJDUMP -f'
+ ;;
+
+darwin* | rhapsody*)
+ lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ case "$host_os" in
+ rhapsody* | darwin1.[012])
+ lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+ ;;
+ *) # Darwin 1.3 on
+ lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+ ;;
+ esac
+ ;;
+
+freebsd* )
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ case $host_cpu in
+ i*86 )
+ # Not sure whether the presence of OpenBSD here was a mistake.
+ # Let's accept both of them until this is cleared up.
+ lt_cv_deplibs_check_method=['file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+ ;;
+ esac
+ else
+ lt_cv_deplibs_check_method=pass_all
+ fi
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+hpux10.20*|hpux11*)
+ lt_cv_deplibs_check_method=['file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libc.sl
+ ;;
+
+irix5* | irix6*)
+ case $host_os in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case $LD in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method=["file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"]
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case $host_cpu in
+ alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'] ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$']
+ else
+ [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$']
+ fi
+ ;;
+
+newsos6)
+ [lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)']
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libnls.so
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+[sysv5uw[78]* | sysv4*uw2*)]
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case $host_vendor in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]']
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+#
+# Sets NM; the value is cached as lt_cv_path_NM.  A non-empty $NM from
+# the user is taken as is.  Otherwise each directory of $PATH, followed
+# by /usr/ccs/bin, /usr/ucb and /bin, is probed for ${ac_tool_prefix}nm.
+# The search stops at the first candidate that accepts -B or, if it does
+# not, -p, and the accepted flag is made part of the result.  A candidate
+# that accepts neither flag is remembered only if nothing was remembered
+# before it.  When the search finds nothing at all the result is "nm".
+AC_DEFUN([AC_PROG_NM],
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(lt_cv_path_NM,
+[if test -n "$NM"; then
+ # Let the user override the test.
+ lt_cv_path_NM="$NM"
+else
+ # Split the search list on the path separator as well as on whitespace.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ tmp_nm=$ac_dir/${ac_tool_prefix}nm
+ if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ # Tru64's nm complains that /dev/null is an invalid object file
+ if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -B"
+ break
+ elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -p"
+ break
+ else
+ lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi])
+NM="$lt_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+#
+# Sets LIBM to the link flags for the math library.  LIBM stays empty on
+# BeOS, Cygwin and PW32 hosts.  On NCR SysV4.3 it collects -lmw and -lm,
+# each only if the corresponding library check succeeds.  On every other
+# host it is -lm when that library is found and empty otherwise.  LIBM
+# is not substituted by this macro.
+AC_DEFUN([AC_CHECK_LIBM],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cygwin* | *-*-pw32*)
+ # These systems don't have libm
+ ;;
+*-ncr-sysv4.3*)
+ AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+ AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+ ;;
+*)
+ AC_CHECK_LIB(m, main, LIBM="-lm")
+ ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - select the libltdl convenience library.
+# LIBLTDL receives the link flags and INCLTDL the include flags for the
+# libltdl header, and --enable-ltdl-convenience is appended to the
+# configure arguments unless the option was already given.  Neither
+# LIBLTDL nor INCLTDL is AC_SUBSTed, and AC_CONFIG_SUBDIRS is not
+# called.  DIR defaults to `libltdl'.  LIBLTDL is prefixed with
+# '${top_builddir}/' and INCLTDL with '${top_srcdir}/' (note the single
+# quotes!).  If your package is not flat and you're not using automake,
+# define top_builddir and top_srcdir appropriately in the Makefiles.
+# Configuring with --disable-ltdl-convenience is a fatal error.
+AC_DEFUN([AC_LIBLTDL_CONVENIENCE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ if test "x$enable_ltdl_convenience" = xno; then
+ AC_MSG_ERROR([this package needs a convenience libltdl])
+ elif test -z "$enable_ltdl_convenience"; then
+ enable_ltdl_convenience=yes
+ ac_configure_args="$ac_configure_args --enable-ltdl-convenience"
+ fi
+ LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdlc.la
+ INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-install to the configure
+# arguments. Note that LIBLTDL and INCLTDL are not AC_SUBSTed, nor is
+# AC_CONFIG_SUBDIRS called. If DIR is not provided and an installed
+# libltdl is not found, it is assumed to be `libltdl'. LIBLTDL will
+# be prefixed with '${top_builddir}/' and INCLTDL will be prefixed
+# with '${top_srcdir}/' (note the single quotes!). If your package is
+# not flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+#
+# When an installed libltdl is used instead, LIBLTDL is -lltdl, INCLTDL
+# is empty and --enable-ltdl-install=no is added to the configure
+# arguments.
+AC_DEFUN([AC_LIBLTDL_INSTALLABLE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ AC_CHECK_LIB(ltdl, main,
+ [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+ [if test x"$enable_ltdl_install" = xno; then
+ AC_MSG_WARN([libltdl not installed, but installation disabled])
+ else
+ enable_ltdl_install=yes
+ fi
+ ])
+ # enable_ltdl_install is yes at this point when the user asked for it,
+ # or when no installed libltdl was found and installation was not
+ # disabled.
+ if test x"$enable_ltdl_install" = x"yes"; then
+ ac_configure_args="$ac_configure_args --enable-ltdl-install"
+ LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdl.la
+ INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+ else
+ ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+ LIBLTDL="-lltdl"
+ INCLTDL=
+ fi
+])
-# AC_PROVIDE_IFELSE(MACRO-NAME, IF-PROVIDED, IF-NOT-PROVIDED)
-# -----------------------------------------------------------
-# If MACRO-NAME is provided do IF-PROVIDED, else IF-NOT-PROVIDED.
-# The purpose of this macro is to provide the user with a means to
-# check macros which are provided without letting her know how the
-# information is coded.
# If this macro is not defined by Autoconf, define it here.
ifdef([AC_PROVIDE_IFELSE],
[],
[ifdef([AC_PROVIDE_$1],
[$2], [$3])])])
+# AC_LIBTOOL_F77 - enable support for fortran libraries
+AC_DEFUN([AC_LIBTOOL_F77], [AC_REQUIRE([_AC_LIBTOOL_F77])])
+
+# _AC_LIBTOOL_F77 - worker behind the public macro above.
+# Appends ltcf-f77.sh to LIBTOOL_DEPS, then runs ltconfig with
+# --add-tag=F77 and that script.  For this one invocation ltconfig sees
+# CC set to $F77, CFLAGS set to $FFLAGS, an empty CPPFLAGS and LTCC set
+# to the C compiler.  Configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_F77],
+[AC_REQUIRE([AC_PROG_F77])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the fortran compiler.
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
-# AM_INIT_AUTOMAKE(PACKAGE,VERSION, [NO-DEFINE])
-# ----------------------------------------------
-AC_DEFUN([AM_INIT_AUTOMAKE],
-[AC_REQUIRE([AC_PROG_INSTALL])dnl
-# test to see if srcdir already configured
-if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" &&
- test -f $srcdir/config.status; then
- AC_MSG_ERROR([source directory already configured; run \"make distclean\" there first])
-fi
+# AC_LIBTOOL_CXX - enable support for C++ libraries
+AC_DEFUN([AC_LIBTOOL_CXX], [AC_REQUIRE([_AC_LIBTOOL_CXX])])
+
+# _AC_LIBTOOL_CXX - worker behind the public macro above.
+# Appends ltcf-cxx.sh to LIBTOOL_DEPS, then runs ltconfig with
+# --add-tag=CXX and that script.  For this one invocation ltconfig sees
+# CC set to $CXX, CFLAGS set to $CXXFLAGS and LTCC set to the C
+# compiler.  Configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_CXX],
+[AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-cxx.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the C++ compiler.
+AR="$AR" LTCC="$CC" CC="$CXX" CXX="$CXX" CFLAGS="$CXXFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=CXX $ac_aux_dir/ltcf-cxx.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
-# Define the identity of the package.
-PACKAGE=$1
-AC_SUBST(PACKAGE)dnl
-VERSION=$2
-AC_SUBST(VERSION)dnl
-ifelse([$3],,
-[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
-AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])
-
-# Autoconf 2.50 wants to disallow AM_ names. We explicitly allow
-# the ones we care about.
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CPPFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CXXFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_OBJCFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_FFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_RFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_GCJFLAGS])])
-
-# Some tools Automake needs.
-AC_REQUIRE([AM_SANITY_CHECK])dnl
-AC_REQUIRE([AC_ARG_PROGRAM])dnl
-AM_MISSING_PROG(ACLOCAL, aclocal)
-AM_MISSING_PROG(AUTOCONF, autoconf)
-AM_MISSING_PROG(AUTOMAKE, automake)
-AM_MISSING_PROG(AUTOHEADER, autoheader)
-AM_MISSING_PROG(MAKEINFO, makeinfo)
-AM_MISSING_PROG(AMTAR, tar)
-AM_MISSING_INSTALL_SH
-# We need awk for the "check" target. The system "awk" is bad on
-# some platforms.
-AC_REQUIRE([AC_PROG_AWK])dnl
-AC_REQUIRE([AC_PROG_MAKE_SET])dnl
-AC_REQUIRE([AM_DEP_TRACK])dnl
-AC_REQUIRE([AM_SET_DEPDIR])dnl
-AC_PROVIDE_IFELSE([AC_PROG_][CC],
- [AM_DEPENDENCIES(CC)],
- [define([AC_PROG_][CC],
- defn([AC_PROG_][CC])[AM_DEPENDENCIES(CC)])])dnl
-AC_PROVIDE_IFELSE([AC_PROG_][CXX],
- [AM_DEPENDENCIES(CXX)],
- [define([AC_PROG_][CXX],
- defn([AC_PROG_][CXX])[AM_DEPENDENCIES(CXX)])])dnl
+# AC_LIBTOOL_GCJ - enable support for GCJ libraries
+AC_DEFUN([AC_LIBTOOL_GCJ],[AC_REQUIRE([_AC_LIBTOOL_GCJ])])
+
+# _AC_LIBTOOL_GCJ - worker behind the public macro above.
+# Unless one of the known GCJ-detecting macros has already been
+# provided, the first of them that is defined is required.  ltcf-gcj.sh
+# is then appended to LIBTOOL_DEPS and ltconfig is run with
+# --add-tag=GCJ and that script.  For this one invocation ltconfig sees
+# CC set to $GCJ, CFLAGS set to $GCJFLAGS and LTCC set to the C
+# compiler.  Configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_GCJ],
+[AC_REQUIRE([AC_PROG_LIBTOOL])
+AC_PROVIDE_IFELSE([AC_PROG_GCJ],[],
+ [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],[],
+ [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],[],
+ [ifdef([AC_PROG_GCJ],[AC_REQUIRE([AC_PROG_GCJ])],
+ [ifdef([A][M_PROG_GCJ],[AC_REQUIRE([A][M_PROG_GCJ])],
+ [AC_REQUIRE([A][C_PROG_GCJ_OR_A][M_PROG_GCJ])])])])])])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-gcj.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the GCJ compiler.
+AR="$AR" LTCC="$CC" CC="$GCJ" CFLAGS="$GCJFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=GCJ $ac_aux_dir/ltcf-gcj.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+dnl old names
+dnl Each AM_ macro below simply expands to the AC_ macro with the same
+dnl base name; the ENABLE and DISABLE variants pass their arguments on.
+AC_DEFUN([AM_PROG_LIBTOOL], [AC_PROG_LIBTOOL])
+AC_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)])
+AC_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)])
+AC_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+AC_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+AC_DEFUN([AM_PROG_LD], [AC_PROG_LD])
+AC_DEFUN([AM_PROG_NM], [AC_PROG_NM])
+
+dnl This is just to silence aclocal about the macro not being used
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+ifelse([AC_DISABLE_SHARED])dnl
+
+# LT_AC_PROG_GCJ - check for the gcj tool.
+# GCJ falls back to "no" as its not-found value.  GCJFLAGS is given the
+# default "-g -O2" only when the variable is unset, so an explicitly
+# empty value is kept.  GCJFLAGS is substituted.
+AC_DEFUN([LT_AC_PROG_GCJ],
+[AC_CHECK_TOOL(GCJ, gcj, no)
+ test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+ AC_SUBST(GCJFLAGS)
 ])
+# AM_CONDITIONAL(NAME, SHELL-CONDITION)
+# -------------------------------------
+# Define a conditional.
+# SHELL-CONDITION is run as the condition of a shell if statement.
+# When it succeeds NAME_TRUE is empty and NAME_FALSE is '#'; when it
+# fails the two values are swapped.  Both variables are substituted.
+# The macro name and the substituted variable names are quoted so that
+# they are never expanded as macros themselves.
+
+AC_DEFUN([AM_CONDITIONAL],
+[AC_SUBST([$1_TRUE])
+AC_SUBST([$1_FALSE])
+if $2; then
+ $1_TRUE=
+ $1_FALSE='#'
+else
+ $1_TRUE='#'
+ $1_FALSE=
+fi])
+
+# Do all the work for Automake. This macro actually does too much --
+# some checks are only needed if your package does certain things.
+# But this isn't really a big deal.
+
+# serial 1
+
+dnl Usage:
+dnl AM_INIT_AUTOMAKE(package,version, [no-define])
+dnl
+dnl Sets and substitutes PACKAGE and VERSION, refuses to continue when
+dnl the source directory is a different directory that has already been
+dnl configured, and looks for the maintainer tools, falling back to the
+dnl missing script in the auxiliary directory.  PACKAGE and VERSION are
+dnl also defined for the preprocessor unless a third argument is given.
+
+AC_DEFUN([AM_INIT_AUTOMAKE],
+[AC_REQUIRE([AC_PROG_INSTALL])
+PACKAGE=[$1]
+AC_SUBST(PACKAGE)
+VERSION=[$2]
+AC_SUBST(VERSION)
+dnl test to see if srcdir already configured
+dnl CDPATH is overridden inside the command substitutions: a cd that is
+dnl resolved through CDPATH prints the directory it entered, and that
+dnl extra line would end up in the captured pwd output.
+if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then
+ AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+fi
+dnl The two defines are quoted so that they are expanded only when the
+dnl no-define argument is empty.
+ifelse([$3],,
+[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])
+AC_REQUIRE([AM_SANITY_CHECK])
+AC_REQUIRE([AC_ARG_PROGRAM])
+dnl FIXME This is truly gross.
+missing_dir=`CDPATH=:; cd $ac_aux_dir && pwd`
+AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir)
+AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir)
+AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir)
+AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir)
+AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir)
+AC_REQUIRE([AC_PROG_MAKE_SET])])
+
#
# Check to make sure that the build environment is sane.
#
-# serial 3
-
-# AM_SANITY_CHECK
-# ---------------
-AC_DEFUN([AM_SANITY_CHECK],
+AC_DEFUN(AM_SANITY_CHECK,
[AC_MSG_CHECKING([whether build environment is sane])
# Just in case
sleep 1
-echo timestamp > conftest.file
+echo timestamp > conftestfile
# Do `set' in a subshell so we don't clobber the current shell's
# arguments. Must try -L first in case configure is actually a
# symlink; some systems play weird games with the mod time of symlinks
# (eg FreeBSD returns the mod time of the symlink's containing
# directory).
if (
- set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
- if test "$[*]" = "X"; then
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+ if test "[$]*" = "X"; then
# -L didn't work.
- set X `ls -t $srcdir/configure conftest.file`
+ set X `ls -t $srcdir/configure conftestfile`
fi
- if test "$[*]" != "X $srcdir/configure conftest.file" \
- && test "$[*]" != "X conftest.file $srcdir/configure"; then
+ if test "[$]*" != "X $srcdir/configure conftestfile" \
+ && test "[$]*" != "X conftestfile $srcdir/configure"; then
# If neither matched, then we have a broken ls. This can happen
# if, for instance, CONFIG_SHELL is bash and it inherits a
alias in your environment])
fi
- test "$[2]" = conftest.file
+ test "[$]2" = conftestfile
)
then
# Ok.
rm -f conftest*
AC_MSG_RESULT(yes)])
-
-# serial 2
-
-# AM_MISSING_PROG(NAME, PROGRAM)
-# ------------------------------
-AC_DEFUN([AM_MISSING_PROG],
-[AC_REQUIRE([AM_MISSING_HAS_RUN])
-$1=${$1-"${am_missing_run}$2"}
-AC_SUBST($1)])
-
-
-# AM_MISSING_INSTALL_SH
-# ---------------------
-# Like AM_MISSING_PROG, but only looks for install-sh.
-AC_DEFUN([AM_MISSING_INSTALL_SH],
-[AC_REQUIRE([AM_MISSING_HAS_RUN])
-if test -z "$install_sh"; then
- for install_sh in "$ac_aux_dir/install-sh" \
- "$ac_aux_dir/install.sh" \
- "${am_missing_run}${ac_auxdir}/install-sh";
- do
- test -f "$install_sh" && break
- done
- # FIXME: an evil hack: we remove the SHELL invocation from
- # install_sh because automake adds it back in. Sigh.
- install_sh=`echo $install_sh | sed -e 's/\${SHELL}//'`
-fi
-AC_SUBST(install_sh)])
-
-
-# AM_MISSING_HAS_RUN
-# ------------------
-# Define MISSING if not defined so far and test if it supports --run.
-# If it does, set am_missing_run to use it, otherwise, to nothing.
-AC_DEFUN([AM_MISSING_HAS_RUN],
-[test x"${MISSING+set}" = xset ||
- MISSING="\${SHELL} `CDPATH=:; cd $ac_aux_dir && pwd`/missing"
-# Use eval to expand $SHELL
-if eval "$MISSING --run :"; then
- am_missing_run="$MISSING --run "
-else
- am_missing_run=
- am_backtick='`'
- AC_MSG_WARN([${am_backtick}missing' script is too old or missing])
-fi
-])
-
-# serial 3
-
-# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
-# written in clear, in which case automake, when reading aclocal.m4,
-# will think it sees a *use*, and therefore will trigger all it's
-# C support machinery. Also note that it means that autoscan, seeing
-# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
-
-# AM_DEPENDENCIES(NAME)
-# ---------------------
-# See how the compiler implements dependency checking.
-# NAME is "CC", "CXX" or "OBJC".
-# We try a few techniques and use that to set a single cache variable.
-AC_DEFUN([AM_DEPENDENCIES],
-[AC_REQUIRE([AM_SET_DEPDIR])dnl
-AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
-ifelse([$1], CC,
- [AC_REQUIRE([AC_PROG_][CC])dnl
-AC_REQUIRE([AC_PROG_][CPP])
-depcc="$CC"
-depcpp="$CPP"],
- [$1], CXX, [AC_REQUIRE([AC_PROG_][CXX])dnl
-AC_REQUIRE([AC_PROG_][CXXCPP])
-depcc="$CXX"
-depcpp="$CXXCPP"],
- [$1], OBJC, [am_cv_OBJC_dependencies_compiler_type=gcc],
- [AC_REQUIRE([AC_PROG_][$1])dnl
-depcc="$$1"
-depcpp=""])
-
-AC_REQUIRE([AM_MAKE_INCLUDE])
-
-AC_CACHE_CHECK([dependency style of $depcc],
- [am_cv_$1_dependencies_compiler_type],
-[if test -z "$AMDEP"; then
- # We make a subdir and do the tests there. Otherwise we can end up
- # making bogus files that we don't know about and never remove. For
- # instance it was reported that on HP-UX the gcc test will end up
- # making a dummy file named `D' -- because `-MD' means `put the output
- # in D'.
- mkdir confdir
- # Copy depcomp to subdir because otherwise we won't find it if we're
- # using a relative directory.
- cp "$am_depcomp" confdir
- cd confdir
-
- am_cv_$1_dependencies_compiler_type=none
- for depmode in `sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < "./depcomp"`; do
- # We need to recreate these files for each test, as the compiler may
- # overwrite some of them when testing with obscure command lines.
- # This happens at least with the AIX C compiler.
- echo '#include "conftest.h"' > conftest.c
- echo 'int i;' > conftest.h
-
- case "$depmode" in
- nosideeffect)
- # after this tag, mechanisms are not by side-effect, so they'll
- # only be used when explicitly requested
- if test "x$enable_dependency_tracking" = xyes; then
- continue
- else
- break
- fi
- ;;
- none) break ;;
- esac
- # We check with `-c' and `-o' for the sake of the "dashmstdout"
- # mode. It turns out that the SunPro C++ compiler does not properly
- # handle `-M -o', and we need to detect this.
- if depmode="$depmode" \
- source=conftest.c object=conftest.o \
- depfile=conftest.Po tmpdepfile=conftest.TPo \
- $SHELL ./depcomp $depcc -c conftest.c -o conftest.o >/dev/null 2>&1 &&
- grep conftest.h conftest.Po > /dev/null 2>&1; then
- am_cv_$1_dependencies_compiler_type="$depmode"
- break
- fi
- done
-
- cd ..
- rm -rf confdir
-else
- am_cv_$1_dependencies_compiler_type=none
-fi
-])
-$1DEPMODE="depmode=$am_cv_$1_dependencies_compiler_type"
-AC_SUBST([$1DEPMODE])
-])
-
-
-# AM_SET_DEPDIR
-# -------------
-# Choose a directory name for dependency files.
-# This macro is AC_REQUIREd in AM_DEPENDENCIES
-AC_DEFUN([AM_SET_DEPDIR],
-[if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
- DEPDIR=.deps
- # We redirect because .deps might already exist and be populated.
- # In this situation we don't want to see an error.
- rmdir .deps > /dev/null 2>&1
-else
- DEPDIR=_deps
-fi
-AC_SUBST(DEPDIR)
-])
-
-
-# AM_DEP_TRACK
-# ------------
-AC_DEFUN([AM_DEP_TRACK],
-[AC_ARG_ENABLE(dependency-tracking,
-[ --disable-dependency-tracking Speeds up one-time builds
- --enable-dependency-tracking Do not reject slow dependency extractors])
-if test "x$enable_dependency_tracking" = xno; then
- AMDEP="#"
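+dnl AM_MISSING_PROG(NAME, PROGRAM, DIRECTORY)
+dnl Sets and substitutes NAME: PROGRAM itself when running it with
+dnl --version succeeds, otherwise DIRECTORY/missing followed by PROGRAM.
+dnl The program must properly implement --version.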
+AC_DEFUN(AM_MISSING_PROG,
+[AC_MSG_CHECKING(for working $2)
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if ($2 --version) < /dev/null > /dev/null 2>&1; then
+ $1=$2
+ AC_MSG_RESULT(found)
else
- am_depcomp="$ac_aux_dir/depcomp"
- if test ! -f "$am_depcomp"; then
- AMDEP="#"
- else
- AMDEP=
- fi
+ $1="$3/missing $2"
+ AC_MSG_RESULT(missing)
fi
-AC_SUBST(AMDEP)
-if test -z "$AMDEP"; then
- AMDEPBACKSLASH='\'
-else
- AMDEPBACKSLASH=
-fi
-pushdef([subst], defn([AC_SUBST]))
-subst(AMDEPBACKSLASH)
-popdef([subst])
-])
-
-# Generate code to set up dependency tracking.
-# This macro should only be invoked once -- use via AC_REQUIRE.
-# Usage:
-# AM_OUTPUT_DEPENDENCY_COMMANDS
-
-#
-# This code is only required when automatic dependency tracking
-# is enabled. FIXME. This creates each `.P' file that we will
-# need in order to bootstrap the dependency handling code.
-AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],[
-AC_OUTPUT_COMMANDS([
-test x"$AMDEP" != x"" ||
-for mf in $CONFIG_FILES; do
- case "$mf" in
- Makefile) dirpart=.;;
- */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
- *) continue;;
- esac
- grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
- # Extract the definition of DEP_FILES from the Makefile without
- # running `make'.
- DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
- test -z "$DEPDIR" && continue
- # When using ansi2knr, U may be empty or an underscore; expand it
- U=`sed -n -e '/^U = / s///p' < "$mf"`
- test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
- # We invoke sed twice because it is the simplest approach to
- # changing $(DEPDIR) to its actual value in the expansion.
- for file in `sed -n -e '
- /^DEP_FILES = .*\\\\$/ {
- s/^DEP_FILES = //
- :loop
- s/\\\\$//
- p
- n
- /\\\\$/ b loop
- p
- }
- /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
- sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
- # Make sure the directory exists.
- test -f "$dirpart/$file" && continue
- fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
- $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
- # echo "creating $dirpart/$file"
- echo '# dummy' > "$dirpart/$file"
- done
-done
-], [AMDEP="$AMDEP"
-ac_aux_dir="$ac_aux_dir"])])
-
-# AM_MAKE_INCLUDE()
-# -----------------
-# Check to see how make treats includes.
-AC_DEFUN([AM_MAKE_INCLUDE],
-[am_make=${MAKE-make}
-# BSD make uses .include
-cat > confinc << 'END'
-doit:
- @echo done
-END
-# If we don't find an include directive, just comment out the code.
-AC_MSG_CHECKING([for style of include used by $am_make])
-_am_include='#'
-for am_inc in include .include; do
- echo "$am_inc confinc" > confmf
- if test "`$am_make -f confmf 2> /dev/null`" = "done"; then
- _am_include=$am_inc
- break
- fi
-done
-AC_SUBST(_am_include)
-AC_MSG_RESULT($_am_include)
-rm -f confinc confmf
-])
+AC_SUBST($1)])
# Like AC_CONFIG_HEADER, but automatically create stamp file.
-# serial 3
-
-# When config.status generates a header, we must update the stamp-h file.
-# This file resides in the same directory as the config header
-# that is generated. We must strip everything past the first ":",
-# and everything past the last "/".
-
-AC_PREREQ([2.12])
-
-AC_DEFUN([AM_CONFIG_HEADER],
-[AC_CONFIG_HEADER([$1])
- AC_OUTPUT_COMMANDS(
- ifelse(patsubst([$1], [[^ ]], []),
- [],
- [test -z "$CONFIG_HEADERS" || echo timestamp >dnl
- patsubst([$1], [^\([^:]*/\)?.*], [\1])stamp-h]),
- [am_indx=1
- for am_file in $1; do
- case " $CONFIG_HEADERS " in
- *" $am_file "*)
- echo timestamp > `echo $am_file | sed 's%:.*%%;s%[^/]*$%%'`stamp-h$am_indx
- ;;
- esac
- am_indx=\`expr \$am_indx + 1\`
- done])
-])
-
-# serial 2
-
-# AM_CONDITIONAL(NAME, SHELL-CONDITION)
-# -------------------------------------
-# Define a conditional.
-AC_DEFUN([AM_CONDITIONAL],
-[AC_SUBST([$1_TRUE])
-AC_SUBST([$1_FALSE])
-if $2; then
- $1_TRUE=
- $1_FALSE='#'
-else
- $1_TRUE='#'
- $1_FALSE=
-fi])
+# AM_CONFIG_HEADER(HEADER...)
+# When the argument contains no space, the registered command writes
+# stamp-h into the directory of that header, unless CONFIG_HEADERS is
+# empty.  Otherwise the headers are numbered from 1 in argument order
+# and each one that is listed in CONFIG_HEADERS gets stamp-h followed
+# by its number, written into the directory of that header.
+AC_DEFUN(AM_CONFIG_HEADER,
+[AC_PREREQ([2.12])
+AC_CONFIG_HEADER([$1])
+dnl When config.status generates a header, we must update the stamp-h file.
+dnl This file resides in the same directory as the config header
+dnl that is generated. We must strip everything past the first ":",
+dnl and everything past the last "/".
+AC_OUTPUT_COMMANDS(changequote(<<,>>)dnl
+ifelse(patsubst(<<$1>>, <<[^ ]>>, <<>>), <<>>,
+<<test -z "<<$>>CONFIG_HEADERS" || echo timestamp > patsubst(<<$1>>, <<^\([^:]*/\)?.*>>, <<\1>>)stamp-h<<>>dnl>>,
+<<am_indx=1
+for am_file in <<$1>>; do
+ case " <<$>>CONFIG_HEADERS " in
+ *" <<$>>am_file "*<<)>>
+ echo timestamp > `echo <<$>>am_file | sed -e 's%:.*%%' -e 's%[^/]*$%%'`stamp-h$am_indx
+ ;;
+ esac
+ am_indx=`expr "<<$>>am_indx" + 1`
+done<<>>dnl>>)
+changequote([,]))])
jsr \$26,exit
.end main
EOF
- $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null >/dev/null
if test "$?" = 0 ; then
case `./$dummy` in
0-0)
exit (0);
}
EOF
- (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null ) && HP_ARCH=`./$dummy`
+ (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null > /dev/null) && HP_ARCH=`./$dummy`
if test -z "$HP_ARCH"; then HP_ARCH=hppa; fi
rm -f $dummy.c $dummy
fi ;;
}
EOF
LIBC=""
- $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null
if test "$?" = 0 ; then
./$dummy | grep 1\.99 > /dev/null
if test "$?" = 0 ; then
.end main
EOF
LIBC=""
- $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null >/dev/null
if test "$?" = 0 ; then
case `./$dummy` in
0-0)
return 0;
}
EOF
- $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
rm -f $dummy.c $dummy
elif test "${UNAME_MACHINE}" = "s390"; then
echo s390-ibm-linux && exit 0
return 0;
}
EOF
- $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
rm -f $dummy.c $dummy
test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0
fi ;;
--- /dev/null
+#### This script is meant to be sourced by ltconfig.
+
+# ltcf-c.sh - Create a C compiler specific configuration
+#
+# Copyright (C) 1996-2000, 2001 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Source file extension for C test sources.
+ac_ext=c
+
+# Object file extension for compiled C test sources.
+objext=o
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="int some_variable = 0;"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='main(){return(0);}'
+
+## Linker Characteristics
+case $host_os in
+cygwin* | mingw*)
+ # FIXME: the MSVC++ port hasn't been tested in a loooong time
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ if test "$with_gcc" != yes; then
+ with_gnu_ld=no
+ fi
+ ;;
+
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+ # If archive_cmds runs LD, not CC, wlarc should be empty
+ wlarc='${wl}'
+
+ # See if GNU ld supports shared libraries.
+ case $host_os in
+ aix3* | aix4* | aix5*)
+ # On AIX, the GNU linker is very broken
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support. If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+
+ # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+ # that the semantics of dynamic libraries on AmigaOS, at least up
+ # to version 4, is to share data among multiple programs linked
+ # with the same dynamic library. Since this doesn't match the
+ # behavior of shared libraries on other platforms, we cannot use
+ # them.
+ ld_shlibs=no
+ ;;
+
+ beos*)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ allow_undefined_flag=unsupported
+ # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+ # support --undefined. This deserves some investigation. FIXME
+ archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ cygwin* | mingw*)
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec='-L$libdir'
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+
+ extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+ sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+ test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+ if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+ else $CC -o impgen impgen.c ; fi)~
+ $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+ old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+ # cygwin and mingw dlls have different entry points and sets of symbols
+ # to exclude.
+ # FIXME: what about values for MSVC?
+ dll_entry=__cygwin_dll_entry@12
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+ case $host_os in
+ mingw*)
+ # mingw values
+ dll_entry=_DllMainCRTStartup@12
+ dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+ ;;
+ esac
+
+ # mingw and cygwin differ, and it's simplest to just exclude the union
+ # of the two symbol sets.
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one (in ltdll.c)
+ if test "x$lt_cv_need_dllmain" = "xyes"; then
+ ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+ ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+ test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+ else
+ ltdll_obj=
+ ltdll_cmds=
+ fi
+
+ # Extract the symbol export list from an `--export-all' def file,
+ # then regenerate the def file from the symbol export list, so that
+ # the compiled dll only exports the symbol export list.
+ # Be careful not to strip the DATA tag left by newer dlltools.
+ export_symbols_cmds="$ltdll_cmds"'
+ $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+ sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+ # If the export-symbols file already is a .def file (1st line
+ # is EXPORTS), use it as is.
+ # If DATA tags from a recent dlltool are present, honour them!
+ # Otherwise a .def file is generated with one numbered entry per symbol;
+ # a second word on a symbol line (e.g. DATA) is appended to its entry.
+ # The positional parameters are escaped as \$#, \$2 and \$3, exactly like
+ # \$symbol and \$_lt_hint: this is a sourced shell script, so the m4-style
+ # [$] quoting would end up literally in the case word and the .def file.
+ archive_expsym_cmds='if test "x`head -1 $export_symbols`" = xEXPORTS; then
+ cp $export_symbols $output_objdir/$soname-def;
+ else
+ echo EXPORTS > $output_objdir/$soname-def;
+ _lt_hint=1;
+ cat $export_symbols | while read symbol; do
+ set dummy \$symbol;
+ case \$# in
+ 2) echo " \$2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+ *) echo " \$2 @ \$_lt_hint \$3 ; " >> $output_objdir/$soname-def;;
+ esac;
+ _lt_hint=`expr 1 + \$_lt_hint`;
+ done;
+ fi~
+ '"$ltdll_cmds"'
+ $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+ $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+ ;;
+
+ darwin* | rhapsody*)
+ allow_undefined_flag='-undefined suppress'
+ archive_cmds='$CC `test .$module = .yes && echo -bundle || echo -dynamiclib` $allow_undefined_flag -o $lib $libobjs $deplibs $linkopts -install_name $rpath/$soname `test -n "$verstring" -a x$verstring != x0.0 && echo $verstring`'
+ # We need to add '_' to the symbols in $export_symbols first
+ #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ whole_archive_flag_spec='-all_load $convenience'
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+ wlarc=
+ else
+ archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ fi
+ ;;
+
+ solaris* | sysv5*)
+ if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems. Therefore, libtool
+*** is disabling shared libraries support. We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer. Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+ elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ wlarc=
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+ esac
+
+ if test "$ld_shlibs" = yes; then
+ runpath_var=LD_RUN_PATH
+ hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+ export_dynamic_flag_spec='${wl}--export-dynamic'
+ case $host_os in
+ cygwin* | mingw*)
+ # dlltool doesn't understand --whole-archive et. al.
+ whole_archive_flag_spec=
+ ;;
+ *)
+ # ancient GNU ld didn't support --whole-archive et. al.
+ if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+ whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+ else
+ whole_archive_flag_spec=
+ fi
+ ;;
+ esac
+ fi
+else
+ # PORTME fill in a description of your system's linker (not GNU ld)
+ case $host_os in
+ aix3*)
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+ archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+ # Note: this linker hardcodes the directories in LIBPATH if there
+ # are no directories specified by -L.
+ hardcode_minus_L=yes
+ if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+ # Neither direct hardcoding nor static linking is supported with a
+ # broken collect2.
+ hardcode_direct=unsupported
+ fi
+ ;;
+
+ aix4* | aix5*)
+ hardcode_direct=yes
+ hardcode_libdir_separator=':'
+ link_all_deplibs=yes
+ # When large executables or shared objects are built, AIX ld can
+ # have problems creating the table of contents. If linking a library
+ # or program results in "error TOC overflow" add -mminimal-toc to
+ # CXXFLAGS/CFLAGS for g++/gcc. In the cases where that is not
+ # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+ if test "$with_gcc" = yes; then
+ case $host_os in aix4.[012]|aix4.[012].*)
+ # We only want to do this on AIX 4.2 and lower, the check
+ # below for broken collect2 doesn't work under 4.3+
+ collect2name=`${CC} -print-prog-name=collect2`
+ if test -f "$collect2name" && \
+ strings "$collect2name" | grep resolve_lib_name >/dev/null
+ then
+ # We have reworked collect2
+ hardcode_direct=yes
+ else
+ # We have old collect2
+ hardcode_direct=unsupported
+ # It fails to find uninstalled libraries when the uninstalled
+ # path is not listed in the libpath. Setting hardcode_minus_L
+ # to unsupported forces relinking
+ hardcode_minus_L=yes
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_libdir_separator=
+ fi
+ esac
+ shared_flag='-shared'
+ else
+ # not using gcc
+ if test "$host_cpu" = ia64; then
+ shared_flag='${wl}-G'
+ else
+ shared_flag='${wl}-bM:SRE'
+ fi
+ fi
+
+ if test "$host_cpu" = ia64; then
+ # On IA64, the linker does run time linking by default, so we don't
+ # have to do anything special.
+ aix_use_runtimelinking=no
+ exp_sym_flag='-Bexport'
+ no_entry_flag=""
+ else
+ # Test if we are trying to use run time linking, or normal AIX style linking.
+ # If -brtl is somewhere in LDFLAGS, we need to do run time linking.
+ aix_use_runtimelinking=no
+ for ld_flag in $LDFLAGS; do
+ if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl" ); then
+ aix_use_runtimelinking=yes
+ break
+ fi
+ done
+ exp_sym_flag='-bexport'
+ no_entry_flag='-bnoentry'
+ fi
+ # -bexpall does not export symbols beginning with underscore (_)
+ always_export_symbols=yes
+ if test "$aix_use_runtimelinking" = yes; then
+ # Warning - without using the other run time loading flags (-brtl), -berok will
+ # link without error, but may produce a broken library.
+ allow_undefined_flag=' ${wl}-berok'
+ hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+ else
+ if test "$host_cpu" = ia64; then
+ hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+ allow_undefined_flag="-z nodefs"
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+ else
+ allow_undefined_flag=' ${wl}-berok'
+ # -bexpall does not export symbols beginning with underscore (_)
+ always_export_symbols=yes
+ # Exported symbols can be pulled into shared objects from archives
+ whole_archive_flag_spec=' '
+ build_libtool_need_lc=yes
+ hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+ # This is similar to how AIX traditionally builds its shared libraries.
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+ fi
+ fi
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ # see comment about different semantics on the GNU ld section
+ ld_shlibs=no
+ ;;
+
+ cygwin* | mingw*)
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec=' '
+ allow_undefined_flag=unsupported
+ # Tell ltmain to make .lib files, not .a files.
+ libext=lib
+ # FIXME: Setting linknames here is a bad hack.
+ archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+ # The linker will automatically build a .lib file if we build a DLL.
+ old_archive_from_new_cmds='true'
+ # FIXME: Should let the user specify the lib program.
+ old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+ fix_srcfile_path='`cygpath -w "$srcfile"`'
+ ;;
+
+ freebsd1*)
+ ld_shlibs=no
+ ;;
+
+ # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+ # support. Future versions do this automatically, but an explicit c++rt0.o
+ # does not break anything, and helps significantly (at the cost of a little
+ # extra space).
+ freebsd2.2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+ freebsd2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+ freebsd*)
+ archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ case $host_os in
+ hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+ *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+ esac
+ hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_direct=yes
+ hardcode_minus_L=yes # Not in the search PATH, but as the default
+ # location of the library.
+ export_dynamic_flag_spec='${wl}-E'
+ ;;
+
+ irix5* | irix6*)
+ if test "$with_gcc" = yes; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ else
+ archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ link_all_deplibs=yes
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
+ else
+ archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF
+ fi
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ newsos6)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linkopts'
+ hardcode_direct=yes
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_shlibpath_var=no
+ ;;
+
+ openbsd*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ os2*)
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ allow_undefined_flag=unsupported
+ archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+ old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+ ;;
+
+ osf3*)
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ osf4* | osf5*) # as osf3* with the addition of -msym flag
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+ $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+ # cc supports -rpath directly
+ hardcode_libdir_flag_spec='-rpath $libdir'
+ fi
+ hardcode_libdir_separator=:
+ ;;
+
+ sco3.2v5*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ;;
+
+ solaris*)
+ no_undefined_flag=' -z defs'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_shlibpath_var=no
+ case $host_os in
+ solaris2.[0-5] | solaris2.[0-5].*) ;;
+ *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+ whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+ esac
+ link_all_deplibs=yes
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+ ;;
+
+ sysv4.3*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ export_dynamic_flag_spec='-Bexport'
+ ;;
+
+ sysv5*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec=
+ hardcode_shlibpath_var=no
+ runpath_var='LD_RUN_PATH'
+ ;;
+
+ uts4*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ dgux*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ld_shlibs=yes
+ fi
+ ;;
+
+ sysv4.2uw2*)
+ archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=no
+ hardcode_shlibpath_var=no
+ hardcode_runpath_var=yes
+ runpath_var=LD_RUN_PATH
+ ;;
+
+ sysv5uw7* | unixware7*)
+ # Shared libraries are linked through the compiler driver here; only
+ # the option that requests a shared object differs between gcc (-shared)
+ # and the native compiler (-G).
+ # NOTE(review): the earlier sysv5*) arm of this case statement appears
+ # to match sysv5uw7* first -- confirm that this arm is reachable for it.
+ no_undefined_flag='${wl}-z ${wl}text'
+ # Use $with_gcc, the variable every other compiler check in this script
+ # tests, instead of $GCC.
+ if test "$with_gcc" = yes; then
+ archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+ else
+ archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+ fi
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ ld_shlibs=no
+ ;;
+ esac
+fi
+
+## Compiler Characteristics: PIC flags, static flags, etc
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+ :
+else
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_shlib=
+ ac_cv_prog_cc_wl=
+ ac_cv_prog_cc_static=
+ ac_cv_prog_cc_no_builtin=
+ ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+ if test "$with_gcc" = yes; then
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-static'
+
+ case $host_os in
+ aix*)
+ # All AIX code is PIC.
+ # NOTE(review): the two assignments below write lt_cv_prog_cc_static,
+ # while the rest of this block reads and writes ac_cv_prog_cc_static,
+ # so they do not replace the '-static' default set above. Confirm
+ # which flags are intended for gcc on AIX before changing the name.
+ if test "$host_cpu" = ia64; then
+ # AIX 5 now supports IA64 processor
+ lt_cv_prog_cc_static='-Bstatic'
+ else
+ lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+ fi
+ ;;
+ amigaos*)
+ # FIXME: we need at least 68020 code to build shared libraries, but
+ # adding the `-m68020' flag to GCC prevents building anything better,
+ # like `-m68040'.
+ ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+ ;;
+ beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+ # PIC is the default for these OSes.
+ ;;
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+ darwin* | rhapsody*)
+ # PIC is the default on this platform
+ # Common symbols not allowed in MH_DYLIB files
+ # The flag is stored in ac_cv_prog_cc_pic, the variable this block
+ # initialises, caches and finally extends with -DPIC.
+ ac_cv_prog_cc_pic='-fno-common'
+ ;;
+ *djgpp*)
+ # DJGPP does not support shared libraries at all
+ ac_cv_prog_cc_pic=
+ ;;
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ ac_cv_prog_cc_pic=-Kconform_pic
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic='-fPIC'
+ ;;
+ esac
+ else
+ # PORTME Check for PIC flags for the system compiler.
+ case $host_os in
+ aix*)
+ # All AIX code is PIC.
+ ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+ ac_cv_prog_cc_pic='+Z'
+ ;;
+
+ irix5* | irix6*)
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ # PIC (with -KPIC) is the default.
+ ;;
+
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+
+ newsos6)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ osf3* | osf4* | osf5*)
+ # All OSF/1 code is PIC.
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ ;;
+
+ sco3.2v5*)
+ ac_cv_prog_cc_pic='-Kpic'
+ ac_cv_prog_cc_static='-dn'
+ ac_cv_prog_cc_shlib='-belf'
+ ;;
+
+ solaris*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ sunos4*)
+ ac_cv_prog_cc_pic='-PIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Qoption ld '
+ ;;
+
+ sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ uts4*)
+ ac_cv_prog_cc_pic='-pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec ;then
+ ac_cv_prog_cc_pic='-Kconform_pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ fi
+ ;;
+
+ *)
+ ac_cv_prog_cc_can_build_shared=no
+ ;;
+ esac
+ fi
+ case "$host_os" in
+ # Platforms which do not support PIC, so -DPIC is meaningless
+ # on them:
+ *djgpp*)
+ ac_cv_prog_cc_pic=
+ ;;
+ *)
+ ac_cv_prog_cc_pic="$ac_cv_prog_cc_pic -DPIC"
+ ;;
+ esac
+fi
+
+# Decide whether -lc has to be added explicitly when linking shared
+# libraries. The answer starts out as "yes" and is stored in
+# ac_cv_archive_cmds_needs_lc at the end.
+need_lc=yes
+# The probe below only runs for shared builds with gcc.
+if test "$enable_shared" = yes && test "$with_gcc" = yes; then
+ case $archive_cmds in
+ *'~'*)
+ # FIXME: we may have to deal with multi-command sequences.
+ # (need_lc keeps its default of "yes" for these.)
+ ;;
+ '$CC '*)
+ # Test whether the compiler implicitly links with -lc since on some
+ # systems, -lgcc has to come before -lc. If gcc already passes -lc
+ # to ld, don't add -lc before -lgcc.
+ echo $ac_n "checking whether -lc should be explicitly linked in... $ac_c" 1>&6
+ # Reuse a previously cached answer if ac_cv_archive_cmds_needs_lc is set.
+ if eval "test \"`echo '$''{'ac_cv_archive_cmds_needs_lc'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+ need_lc=$ac_cv_archive_cmds_needs_lc
+ else
+ $rm conftest*
+ # Compile a trivial source file to get an object to link.
+ echo "static int dummy;" > conftest.$ac_ext
+ if { (eval echo ltcf-c.sh:need_lc: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; }; then
+ # Append any warnings to the config.log.
+ cat conftest.err 1>&5
+ # Set up the variables that $archive_cmds refers to so that it
+ # links the test object; -v is passed as the only extra flag so
+ # that the output can be searched for -lc below.
+ soname=conftest
+ lib=conftest
+ libobjs=conftest.$ac_objext
+ deplibs=
+ wl=$ac_cv_prog_cc_wl
+ compiler_flags=-v
+ linker_flags=-v
+ verstring=
+ output_objdir=.
+ libname=conftest
+ # allow_undefined_flag is emptied for the probe and restored afterwards.
+ save_allow_undefined_flag=$allow_undefined_flag
+ allow_undefined_flag=
+ # If " -lc " shows up in the output of the link command, -lc is
+ # already being linked in and must not be added explicitly.
+ if { (eval echo ltcf-c.sh:need_lc: \"$archive_cmds\") 1>&5; (eval $archive_cmds) 2>&1 | grep " -lc " 1>&5 ; }; then
+ need_lc=no
+ fi
+ allow_undefined_flag=$save_allow_undefined_flag
+ else
+ # The test compile failed: log its errors and keep need_lc=yes.
+ cat conftest.err 1>&5
+ fi
+ fi
+ $rm conftest*
+ echo "$ac_t$need_lc" 1>&6
+ ;;
+ esac
+fi
+# Record the result in the cache variable.
+ac_cv_archive_cmds_needs_lc=$need_lc
--- /dev/null
+#### This script is meant to be sourced by ltconfig.
+
+# ltcf-f77.sh - Create a fortran compiler specific configuration
+#
+# Copyright (C) 1996-1999, 2000, 2001 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# NB: This is somewhat of a hack to support fortran for the stuff
+# we need, don't trust it as a complete implementation!
+#
+# Original C++ support by:Gary V. Vaughan <gvv@techie.com>
+# Alexandre Oliva <oliva@lsd.ic.unicamp.br>
+# Ossama Othman <ossama@debian.org>
+# Thomas Thanner <tanner@gmx.de>
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Source file extension for fortran test sources.
+ac_ext=f
+# Source extension for f77 files to be preprocessed
+ac_pre_ext=F
+
+# Object file extension for compiled Fortran test sources.
+objext=o
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code=" subroutine t\n return\n end\n"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code=" program t\n end\n"
+
+# fortran compiler
+F77=${F77-f77}
+
+# ltmain only uses $CC for tagged configurations so we simply trust CC is set....
+
+## Linker Characteristics
+case $host_os in
+cygwin* | mingw*)
+ # FIXME: the MSVC++ port hasn't been tested in a loooong time
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ if test "$with_gcc" != yes; then
+ with_gnu_ld=no
+ fi
+ ;;
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+ # If archive_cmds runs LD, not CC, wlarc should be empty
+ wlarc='${wl}'
+
+ # See if GNU ld supports shared libraries.
+ case $host_os in
+ aix3* | aix4* | aix5*)
+ # On AIX, the GNU linker is very broken
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support. If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+
+ # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+ # that the semantics of dynamic libraries on AmigaOS, at least up
+ # to version 4, is to share data among multiple programs linked
+ # with the same dynamic library. Since this doesn't match the
+ # behavior of shared libraries on other platforms, we cannot use
+ # them.
+ ld_shlibs=no
+ ;;
+
+ beos*)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ allow_undefined_flag=unsupported
+ # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+ # support --undefined. This deserves some investigation. FIXME
+ archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ cygwin* | mingw*)
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec='-L$libdir'
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+
+ extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+ sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+ test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+ if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+ else $CC -o impgen impgen.c ; fi)~
+ $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+ old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+ # cygwin and mingw dlls have different entry points and sets of symbols
+ # to exclude.
+ # FIXME: what about values for MSVC?
+ dll_entry=__cygwin_dll_entry@12
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+ case $host_os in
+ mingw*)
+ # mingw values
+ dll_entry=_DllMainCRTStartup@12
+ dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+ ;;
+ esac
+
+ # mingw and cygwin differ, and it's simplest to just exclude the union
+ # of the two symbol sets.
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one (in ltdll.c)
+ if test "x$lt_cv_need_dllmain" = "xyes"; then
+ ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+ ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+ test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+ else
+ ltdll_obj=
+ ltdll_cmds=
+ fi
+
+ # Extract the symbol export list from an `--export-all' def file,
+ # then regenerate the def file from the symbol export list, so that
+ # the compiled dll only exports the symbol export list.
+ # Be careful not to strip the DATA tag left by newer dlltools.
+ export_symbols_cmds="$ltdll_cmds"'
+ $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+ sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+ # If the export-symbols file already is a .def file (1st line
+ # is EXPORTS), use it as is.
+ # If DATA tags from a recent dlltool are present, honour them!
+ archive_expsym_cmds='if test "x`head -1 $export_symbols`" = xEXPORTS; then
+ cp $export_symbols $output_objdir/$soname-def;
+ else
+ echo EXPORTS > $output_objdir/$soname-def;
+ _lt_hint=1;
+ cat $export_symbols | while read symbol; do
+ set dummy \$symbol;
+ case \[$]# in
+ 2) echo " \[$]2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+ *) echo " \[$]2 @ \$_lt_hint \[$]3 ; " >> $output_objdir/$soname-def;;
+ esac;
+ _lt_hint=`expr 1 + \$_lt_hint`;
+ done;
+ fi~
+ '"$ltdll_cmds"'
+ $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+ $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+ ;;
+
+ darwin* | rhapsody*)
+ # Darwin/Rhapsody: link through $CC, producing a bundle when building a
+ # module and a dynamic library otherwise.
+ allow_undefined_flag='-undefined suppress'
+ # Use $compiler_flags, as every other $CC-driven archive_cmds in this file
+ # does. The previous $linkopts is not set by anything visible here (the
+ # need_lc probe only sets compiler_flags/linker_flags), so user flags and
+ # the probe's -v were silently dropped.
+ archive_cmds='$CC `test .$module = .yes && echo -bundle || echo -dynamiclib` $allow_undefined_flag -o $lib $libobjs $deplibs $compiler_flags -install_name $rpath/$soname `test -n "$verstring" -a x$verstring != x0.0 && echo $verstring`'
+ # We need to add '_' to the symbols in $export_symbols first
+ #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ whole_archive_flag_spec='-all_load $convenience'
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+ wlarc=
+ else
+ archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ fi
+ ;;
+
+ solaris* | sysv5*)
+ if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems. Therefore, libtool
+*** is disabling shared libraries support. We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer. Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+ elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ wlarc=
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+ esac
+
+ if test "$ld_shlibs" = yes; then
+ runpath_var=LD_RUN_PATH
+ hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+ export_dynamic_flag_spec='${wl}--export-dynamic'
+ case $host_os in
+ cygwin* | mingw*)
+ # dlltool doesn't understand --whole-archive et al.
+ whole_archive_flag_spec=
+ ;;
+ *)
+ # ancient GNU ld didn't support --whole-archive et al.
+ if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+ whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+ else
+ whole_archive_flag_spec=
+ fi
+ ;;
+ esac
+ fi
+else
+ # PORTME fill in a description of your system's linker (not GNU ld)
+ case $host_os in
+ aix3*)
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+ archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+ # Note: this linker hardcodes the directories in LIBPATH if there
+ # are no directories specified by -L.
+ hardcode_minus_L=yes
+ if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+ # Neither direct hardcoding nor static linking is supported with a
+ # broken collect2.
+ hardcode_direct=unsupported
+ fi
+ ;;
+
+ aix4* | aix5*)
+ hardcode_direct=yes
+ hardcode_libdir_separator=':'
+ link_all_deplibs=yes
+ # When large executables or shared objects are built, AIX ld can
+ # have problems creating the table of contents. If linking a library
+ # or program results in "error TOC overflow" add -mminimal-toc to
+ # CXXFLAGS/CFLAGS for g++/gcc. In the cases where that is not
+ # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+ if test "$with_gcc" = yes; then
+ case $host_os in aix4.[012]|aix4.[012].*)
+ # We only want to do this on AIX 4.2 and lower, the check
+ # below for broken collect2 doesn't work under 4.3+
+ collect2name=`${CC} -print-prog-name=collect2`
+ if test -f "$collect2name" && \
+ strings "$collect2name" | grep resolve_lib_name >/dev/null
+ then
+ # We have reworked collect2
+ hardcode_direct=yes
+ else
+ # We have old collect2
+ hardcode_direct=unsupported
+ # It fails to find uninstalled libraries when the uninstalled
+ # path is not listed in the libpath. Setting hardcode_minus_L
+ # to unsupported forces relinking
+ hardcode_minus_L=yes
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_libdir_separator=
+ fi
+ esac
+ shared_flag='-shared'
+ else
+ # not using gcc
+ if test "$host_cpu" = ia64; then
+ shared_flag='${wl}-G'
+ else
+ shared_flag='${wl}-bM:SRE'
+ fi
+ fi
+
+ if test "$host_cpu" = ia64; then
+ # On IA64, the linker does run time linking by default, so we don't
+ # have to do anything special.
+ aix_use_runtimelinking=no
+ exp_sym_flag='-Bexport'
+ no_entry_flag=""
+ else
+ # Test if we are trying to use run time linking, or normal AIX style linking.
+ # If -brtl is somewhere in LDFLAGS, we need to do run time linking.
+ aix_use_runtimelinking=no
+ for ld_flag in $LDFLAGS; do
+ if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl" ); then
+ aix_use_runtimelinking=yes
+ break
+ fi
+ done
+ exp_sym_flag='-bexport'
+ no_entry_flag='-bnoentry'
+ fi
+ # -bexpall does not export symbols beginning with underscore (_)
+ always_export_symbols=yes
+ if test "$aix_use_runtimelinking" = yes; then
+ # Warning - without using the other run time loading flags (-brtl), -berok will
+ # link without error, but may produce a broken library.
+ allow_undefined_flag=' ${wl}-berok'
+ hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+ else
+ if test "$host_cpu" = ia64; then
+ hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+ allow_undefined_flag="-z nodefs"
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+ else
+ allow_undefined_flag=' ${wl}-berok'
+ # -bexpall does not export symbols beginning with underscore (_)
+ always_export_symbols=yes
+ # Exported symbols can be pulled into shared objects from archives
+ whole_archive_flag_spec=' '
+ build_libtool_need_lc=yes
+ hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+ # This is similar to how AIX traditionally builds its shared libraries.
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+ fi
+ fi
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ # see comment about different semantics on the GNU ld section
+ ld_shlibs=no
+ ;;
+
+ cygwin* | mingw*)
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec=' '
+ allow_undefined_flag=unsupported
+ # Tell ltmain to make .lib files, not .a files.
+ libext=lib
+ # FIXME: Setting linknames here is a bad hack.
+ archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+ # The linker will automatically build a .lib file if we build a DLL.
+ old_archive_from_new_cmds='true'
+ # FIXME: Should let the user specify the lib program.
+ old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+ fix_srcfile_path='`cygpath -w "$srcfile"`'
+ ;;
+
+ freebsd1*)
+ ld_shlibs=no
+ ;;
+
+ # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+ # support. Future versions do this automatically, but an explicit c++rt0.o
+ # does not break anything, and helps significantly (at the cost of a little
+ # extra space).
+ freebsd2.2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+ freebsd2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+ freebsd*)
+ archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ case $host_os in
+ hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+ *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+ esac
+ hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_direct=yes
+ hardcode_minus_L=yes # Not in the search PATH, but as the default
+ # location of the library.
+ export_dynamic_flag_spec='${wl}-E'
+ ;;
+
+ irix5* | irix6*)
+ if test "$with_gcc" = yes; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ else
+ archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ link_all_deplibs=yes
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
+ else
+ archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF
+ fi
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ newsos6)
+ # NEWS-OS 6: build the shared object directly with $LD.
+ # Use $linker_flags, as every other $LD-driven archive_cmds in this file
+ # does; the previous $linkopts is not set by anything visible here, so
+ # user-supplied linker flags were silently dropped.
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_shlibpath_var=no
+ ;;
+
+ openbsd*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ os2*)
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ allow_undefined_flag=unsupported
+ archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+ old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+ ;;
+
+ osf3*)
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ osf4* | osf5*) # as osf3* with the addition of -msym flag
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+ archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+ $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+ # cc supports -rpath directly
+ hardcode_libdir_flag_spec='-rpath $libdir'
+ fi
+ hardcode_libdir_separator=:
+ ;;
+
+ sco3.2v5*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ;;
+
+ solaris*)
+ no_undefined_flag=' -z defs'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, so avoid it for now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_shlibpath_var=no
+ case $host_os in
+ solaris2.[0-5] | solaris2.[0-5].*) ;;
+ *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+ whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+ esac
+ link_all_deplibs=yes
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+ ;;
+
+ sysv4.3*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ export_dynamic_flag_spec='-Bexport'
+ ;;
+
+ sysv5*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, so avoid it for now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec=
+ hardcode_shlibpath_var=no
+ runpath_var='LD_RUN_PATH'
+ ;;
+
+ uts4*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ dgux*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ld_shlibs=yes
+ fi
+ ;;
+
+ sysv4.2uw2*)
+ archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=no
+ hardcode_shlibpath_var=no
+ hardcode_runpath_var=yes
+ runpath_var=LD_RUN_PATH
+ ;;
+
+ sysv5uw7* | unixware7*)
+ no_undefined_flag='${wl}-z ${wl}text'
+ if test "$GCC" = yes; then
+ archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+ else
+ archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+ fi
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ ld_shlibs=no
+ ;;
+ esac
+fi
+
+
+
+
+#################################
+## Compiler Characteristics: PIC flags, static flags, etc
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+ :
+else
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_shlib=
+ ac_cv_prog_cc_wl=
+ ac_cv_prog_cc_static=
+ ac_cv_prog_cc_no_builtin=
+ ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+ if test "$with_gcc" = yes; then
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-static'
+
+ case $host_os in
+ aix*)
+ # All AIX code is PIC, so no PIC flag is assigned in this branch.
+ # NOTE(review): the assignments below use the lt_cv_ prefix, while every
+ # other assignment in this section (and the cache guard at its top) uses
+ # ac_cv_prog_cc_*. As written they do not change ac_cv_prog_cc_static,
+ # which keeps the gcc default '-static' set above. Confirm whether that
+ # is intended before renaming: these flags carry no $wl prefix, so
+ # passing them to gcc directly may not be valid.
+ if test "$host_cpu" = ia64; then
+ # AIX 5 now supports IA64 processor
+ lt_cv_prog_cc_static='-Bstatic'
+ else
+ lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+ fi
+ ;;
+ amigaos*)
+ # FIXME: we need at least 68020 code to build shared libraries, but
+ # adding the `-m68020' flag to GCC prevents building anything better,
+ # like `-m68040'.
+ ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+ ;;
+ beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+ # PIC is the default for these OSes.
+ ;;
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+ darwin* | rhapsody*)
+ # PIC is the default on this platform
+ # Common symbols not allowed in MH_DYLIB files
+ # Store the flag in ac_cv_prog_cc_pic, the variable this section guards
+ # on and later extends with -DPIC; the former lt_cv_prog_cc_pic name was
+ # not read by anything in this section, so -fno-common was lost.
+ ac_cv_prog_cc_pic='-fno-common'
+ ;;
+ *djgpp*)
+ # DJGPP does not support shared libraries at all
+ ac_cv_prog_cc_pic=
+ ;;
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ ac_cv_prog_cc_pic=-Kconform_pic
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic='-fPIC'
+ ;;
+ esac
+ else
+ # PORTME Check for PIC flags for the system compiler.
+ case $host_os in
+ aix*)
+ # All AIX code is PIC.
+ ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+ ac_cv_prog_cc_pic='+Z'
+ ;;
+
+ irix5* | irix6*)
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ # PIC (with -KPIC) is the default.
+ ;;
+
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+
+ newsos6)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ osf3* | osf4* | osf5*)
+ # All OSF/1 code is PIC.
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ ;;
+
+ sco3.2v5*)
+ ac_cv_prog_cc_pic='-Kpic'
+ ac_cv_prog_cc_static='-dn'
+ ac_cv_prog_cc_shlib='-belf'
+ ;;
+
+ solaris*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ sunos4*)
+ ac_cv_prog_cc_pic='-PIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Qoption ld '
+ ;;
+
+ sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ uts4*)
+ ac_cv_prog_cc_pic='-pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec ;then
+ ac_cv_prog_cc_pic='-Kconform_pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ fi
+ ;;
+
+ *)
+ ac_cv_prog_cc_can_build_shared=no
+ ;;
+ esac
+ fi
+ case "$host_os" in
+ # Platforms which do not support PIC, so -DPIC is meaningless
+ # on them:
+ *djgpp*)
+ ac_cv_prog_cc_pic=
+ ;;
+ *)
+ ac_cv_prog_cc_pic="$ac_cv_prog_cc_pic -DPIC"
+ ;;
+ esac
+fi
+
+# Decide whether -lc must be named explicitly when linking a shared library.
+# need_lc defaults to yes. The probe only runs when shared libraries are
+# enabled and gcc is in use, and only when $archive_cmds is a single command
+# beginning with '$CC '. The outcome is stored in ac_cv_archive_cmds_needs_lc
+# at the end.
+need_lc=yes
+if test "$enable_shared" = yes && test "$with_gcc" = yes; then
+ case $archive_cmds in
+ *'~'*)
+ # FIXME: we may have to deal with multi-command sequences.
+ ;;
+ '$CC '*)
+ # Test whether the compiler implicitly links with -lc since on some
+ # systems, -lgcc has to come before -lc. If gcc already passes -lc
+ # to ld, don't add -lc before -lgcc.
+ echo $ac_n "checking whether -lc should be explicitly linked in... $ac_c" 1>&6
+ # Reuse a previously cached answer if the cache variable is set.
+ if eval "test \"`echo '$''{'ac_cv_archive_cmds_needs_lc'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+ need_lc=$ac_cv_archive_cmds_needs_lc
+ else
+ $rm conftest*
+ # Minimal translation unit: we only need an object file to link.
+ echo "static int dummy;" > conftest.$ac_ext
+ if { (eval echo ltcf-c.sh:need_lc: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; }; then
+ # Append any warnings to the config.log.
+ cat conftest.err 1>&5
+ # Set the variables that $archive_cmds expands, pointing them at the
+ # conftest object. The flags are -v so that the link command prints
+ # what it passes on, which is then searched for " -lc ".
+ soname=conftest
+ lib=conftest
+ libobjs=conftest.$ac_objext
+ deplibs=
+ wl=$ac_cv_prog_cc_wl
+ compiler_flags=-v
+ linker_flags=-v
+ verstring=
+ output_objdir=.
+ libname=conftest
+ # Clear allow_undefined_flag for the probe and restore it afterwards.
+ save_allow_undefined_flag=$allow_undefined_flag
+ allow_undefined_flag=
+ # If " -lc " shows up in the verbose link output, the compiler already
+ # links libc itself, so it need not be added explicitly.
+ if { (eval echo ltcf-c.sh:need_lc: \"$archive_cmds\") 1>&5; (eval $archive_cmds) 2>&1 | grep " -lc " 1>&5 ; }; then
+ need_lc=no
+ fi
+ allow_undefined_flag=$save_allow_undefined_flag
+ else
+ # Compilation failed: log the errors and keep the default need_lc=yes.
+ cat conftest.err 1>&5
+ fi
+ fi
+ $rm conftest*
+ echo "$ac_t$need_lc" 1>&6
+ ;;
+ esac
+fi
+ac_cv_archive_cmds_needs_lc=$need_lc
--- /dev/null
+#! /bin/sh
+
+# ltconfig - Create a system-specific libtool.
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001
+# Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A lot of this script is taken from autoconf-2.10.
+
+# Check that we are running under the correct shell.
+# SHELL is taken from CONFIG_SHELL, falling back to /bin/sh, and $echo starts
+# out as plain echo. The first argument selects one of four paths:
+#   --no-reexec      already restarted once; drop the flag and carry on
+#   --fallback-echo  handled by the next block, nothing to do here
+#   (echo keeps '\t' literal)  the current shell is usable as-is
+#   otherwise        restart this script under $SHELL with --no-reexec
+SHELL=${CONFIG_SHELL-/bin/sh}
+echo=echo
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell.
+ exec "$SHELL" "$0" --no-reexec ${1+"$@"}
+fi
+
+# Fallback echo mode: when invoked as "$0 --fallback-echo ARGS...", print the
+# remaining arguments verbatim through a here-document and exit successfully.
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# Find the correct PATH separator. Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+ UNAME=${UNAME-`uname 2>/dev/null`}
+ case X$UNAME in
+ *-DOS) PATH_SEPARATOR=';' ;;
+ *) PATH_SEPARATOR=':' ;;
+ esac
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+# Pick the sample text used to test candidate echo commands, unless the
+# caller already provided echo_test_string. Candidates are tried from the
+# largest (the first 50 lines of this script) down to the literal "test";
+# the first one the shell can store and compare without error is kept.
+if test "X${echo_test_string+set}" != Xset; then
+ # find a string as large as possible, as long as the shell can cope with it
+ for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do
+ # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+ # Try the assignment in a subshell first so a failure cannot abort this
+ # shell, then repeat it for real and check that the value can be compared.
+ if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+ echo_test_string="`eval $cmd`" &&
+ (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null; then
+ break
+ fi
+ done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ :
+else
+ # The Solaris, AIX, and Digital Unix default echo programs unquote
+ # backslashes. This makes it impossible to quote backslashes using
+ # echo "$something" | sed 's/\\/\\\\/g'
+ #
+ # So, first we look for a working echo in the user's PATH.
+
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR}"
+ for dir in $PATH /usr/ucb; do
+ if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+ test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$dir/echo"
+ break
+ fi
+ done
+ IFS="$save_ifs"
+
+ if test "X$echo" = Xecho; then
+ # We didn't find a better echo, so look for alternatives.
+ if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # This shell has a builtin print -r that does the trick.
+ echo='print -r'
+ elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+ test "X$CONFIG_SHELL" != X/bin/ksh; then
+ # If we have ksh, try running ltconfig again with it.
+ ORIGINAL_CONFIG_SHELL="${CONFIG_SHELL-/bin/sh}"
+ export ORIGINAL_CONFIG_SHELL
+ CONFIG_SHELL=/bin/ksh
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$0" --no-reexec ${1+"$@"}
+ else
+ # Try using printf.
+ echo='printf %s\n'
+ if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # Cool, printf works
+ :
+ elif echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ CONFIG_SHELL="$ORIGINAL_CONFIG_SHELL"
+ export CONFIG_SHELL
+ SHELL="$CONFIG_SHELL"
+ export SHELL
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ elif echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ else
+ # maybe with a smaller string...
+ prev=:
+
+ for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do
+ if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null; then
+ break
+ fi
+ prev="$cmd"
+ done
+
+ if test "$prev" != 'sed 50q "$0"'; then
+ echo_test_string=`eval $prev`
+
+ export echo_test_string
+ exec "${ORIGINAL_CONFIG_SHELL-${CONFIG_SHELL-/bin/sh}}" "$0" ${1+"$@"}
+ else
+ # Oops. We lost completely, so just stick with echo.
+ echo=echo
+ fi
+ fi
+ fi
+ fi
+fi
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# The name of this program.
+progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
+
+# Constants:
+PROGRAM=ltconfig
+PACKAGE=libtool
+VERSION=1.4a
+TIMESTAMP=" (1.641.2.255 2001/05/22 10:39:30)"
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+rm="rm -f"
+
+help="Try \`$progname --help' for more information."
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+enable_shared=yes
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+enable_static=yes
+enable_fast_install=yes
+enable_dlopen=unknown
+enable_win32_dll=no
+pic_mode=default
+ltmain=
+silent=
+srcdir=
+ac_config_guess=
+ac_config_sub=
+host=
+build=NONE
+nonopt=NONE
+ofile="$default_ofile"
+verify_host=yes
+tagname=
+with_gcc=no
+with_gnu_ld=no
+need_locks=yes
+ac_ext=c
+libext=a
+cache_file=
+max_cmd_len=
+
+## Dependencies to place before and after the object being linked:
+predep_objects=
+postdep_objects=
+predeps=
+postdeps=
+compiler_lib_search_path=
+
+## Link characteristics:
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+striplib=
+old_striplib=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude
+# it will be wrapped by ` (' and `)$', so one must not match beginning or
+# end of line. Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid C symbol name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced. Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+## Tools:
+old_AR="$AR"
+old_AR_FLAGS="$AR_FLAGS"
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+old_CPPFLAGS="$CPPFLAGS"
+old_LDFLAGS="$LDFLAGS"
+old_LIBS="$LIBS"
+old_MAGIC_CMD="$MAGIC_CMD"
+old_LD="$LD"
+old_LN_S="$LN_S"
+old_LTCC="$LTCC"
+old_NM="$NM"
+old_RANLIB="$RANLIB"
+old_STRIP="$STRIP"
+old_AS="$AS"
+old_DLLTOOL="$DLLTOOL"
+old_OBJDUMP="$OBJDUMP"
+old_OBJEXT="$OBJEXT"
+old_EXEEXT="$EXEEXT"
+old_reload_flag="$reload_flag"
+old_deplibs_check_method="$deplibs_check_method"
+old_file_magic_cmd="$file_magic_cmd"
+
+# Parse the command line options.
+args=
+prev=
+for option
+do
+ case $option in
+ -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ eval "$prev=\$option"
+ prev=
+ continue
+ fi
+
+ case $option in
+ --help) cat <<EOM
+Usage: $progname [OPTION]... LTMAIN [HOST]
+
+Generate a system-specific libtool script.
+
+ --build configure for building on BUILD [BUILD=HOST]
+ --debug enable verbose shell tracing
+ --disable-shared do not build shared libraries
+ --disable-static do not build static libraries
+ --disable-fast-install do not optimize for fast installation
+ --enable-dlopen enable dlopen support
+ --enable-win32-dll enable building dlls on win32 hosts
+ --help display this help and exit
+ --no-verify do not verify that HOST is a valid host type
+-o, --output=FILE specify the output file [default=$default_ofile]
+ --quiet same as \`--silent'
+ --silent do not print informational messages
+ --srcdir=DIR find \`config.guess' in DIR
+ --version output version information and exit
+ --add-tag=TAG append an alternate configuration
+ --with-gcc assume that the GNU C compiler will be used
+ --with-gnu-ld assume that the C compiler uses the GNU linker
+ --prefer-pic try to use only PIC objects
+ --prefer-non-pic try to use only non-PIC objects
+ --disable-lock disable file locking
+ --cache-file=FILE configure cache file
+
+LTMAIN is the \`ltmain.sh' shell script fragment or \`ltmain.c' program
+that provides basic libtool functionality.
+
+HOST is the canonical host system name [default=guessed].
+EOM
+ exit 0
+ ;;
+
+ --build) prev=build ;;
+ --build=*) build="$optarg" ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --disable-shared) enable_shared=no ;;
+
+ --disable-static) enable_static=no ;;
+
+ --disable-fast-install) enable_fast_install=no ;;
+
+ --enable-dlopen) enable_dlopen=yes ;;
+
+ --enable-win32-dll) enable_win32_dll=yes ;;
+
+ --quiet | --silent) silent=yes ;;
+
+ --srcdir) prev=srcdir ;;
+ --srcdir=*) srcdir="$optarg" ;;
+
+ --no-verify) verify_host=no ;;
+
+ --output | -o) prev=ofile ;;
+ --output=*) ofile="$optarg" ;;
+
+ --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"; exit 0 ;;
+
+ --add-tag) prev=tagname ;;
+ --add-tag=*) tagname="$optarg" ;;
+
+ --with-gcc) with_gcc=yes ;;
+ --with-gnu-ld) with_gnu_ld=yes ;;
+
+ --prefer-pic) pic_mode=yes ;;
+ --prefer-non-pic) pic_mode=no ;;
+
+ --disable-lock) need_locks=no ;;
+
+ --cache-file=*) cache_file="$optarg" ;;
+
+ -*)
+ echo "$progname: unrecognized option \`$option'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ if test -z "$ltmain"; then
+ ltmain="$option"
+ elif test -z "$host"; then
+# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
+# if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
+# echo "$progname: warning \`$option' is not a valid host type" 1>&2
+# fi
+ host="$option"
+ else
+ echo "$progname: too many arguments" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi ;;
+ esac
+done
+
+if test -z "$ltmain"; then
+ echo "$progname: you must specify a LTMAIN file" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+if test ! -f "$ltmain"; then
+ echo "$progname: \`$ltmain' does not exist" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+if test -n "$tagname"; then
+ # Check whether tagname contains only valid characters
+ case `$echo "X$tagname" | $Xsed -e 's/[-_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890,/]//g'` in
+ "") ;;
+ *)
+ echo "$progname: invalid tag name: $tagname" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Scan for a duplicate tag only when the output file exists; redirecting
+ # from a missing file would print a shell error ahead of the warning.
+ if test ! -f "$ofile"; then
+ echo "$progname: warning: output file \`$ofile' does not exist" 1>&2
+ elif grep "^### BEGIN LIBTOOL TAG CONFIG: $tagname$" < "$ofile" > /dev/null; then
+ echo "$progname: tag name $tagname already exists" 1>&2
+ exit 1
+ fi
+
+ if test -z "$LTCC"; then
+ eval "`$SHELL $ofile --config | grep '^LTCC='`"
+ if test -z "$LTCC"; then
+ echo "$progname: warning: output file \`$ofile' does not look like a libtool script" 1>&2
+ else
+ echo "$progname: warning: using \`LTCC=$LTCC', extracted from \`$ofile'" 1>&2
+ fi
+ fi
+fi
+
+# Quote any args containing shell metacharacters.
+ltconfig_args=
+for arg
+do
+ case $arg in
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
+ ltconfig_args="$ltconfig_args '$arg'" ;;
+ *) ltconfig_args="$ltconfig_args $arg" ;;
+ esac
+done
+
+# A relevant subset of AC_INIT.
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 5 compiler messages saved in config.log
+# 6 checking for... messages and results
+if test "$silent" = yes; then
+ exec 6>/dev/null
+else
+ exec 6>&1
+fi
+exec 5>>./config.log
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+if test "X${LC_ALL+set}" = Xset; then LC_ALL=C; export LC_ALL; fi
+if test "X${LANG+set}" = Xset; then LANG=C; export LANG; fi
+
+if test -n "$cache_file" && test -r "$cache_file" && test -f "$cache_file"; then
+ echo "loading cache $cache_file within ltconfig"
+ case $cache_file in */* | ?:[\\/]*) . "$cache_file" ;; *) . "./$cache_file" ;; esac # "." searches PATH for slash-less names
+fi
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+ # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+ ac_n= ac_c='
+' ac_t=' '
+ else
+ ac_n=-n ac_c= ac_t=
+ fi
+else
+ ac_n= ac_c='\c' ac_t=
+fi
+
+if test -z "$srcdir"; then
+ # Assume the source directory is the same one as the path to LTMAIN.
+ srcdir=`$echo "X$ltmain" | $Xsed -e 's%/[^/]*$%%'`
+ test "$srcdir" = "$ltmain" && srcdir=.
+fi
+
+trap "$rm conftest*; exit 1" 1 2 15
+if test "$verify_host" = yes; then
+ # Check for config.guess and config.sub.
+ ac_aux_dir=
+ for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+ if test -f $ac_dir/config.guess; then
+ ac_aux_dir=$ac_dir
+ break
+ fi
+ done
+ if test -z "$ac_aux_dir"; then
+ echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+ ac_config_guess=$ac_aux_dir/config.guess
+ ac_config_sub=$ac_aux_dir/config.sub
+
+ # Make sure we can run config.sub.
+ if $SHELL $ac_config_sub sun4 >/dev/null 2>&1; then :
+ else
+ echo "$progname: cannot run $ac_config_sub" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+
+ echo $ac_n "checking host system type""... $ac_c" 1>&6
+
+ host_alias=$host
+ case $host_alias in
+ "")
+ # Force config.guess to use the C compiler.
+ # CC_FOR_BUILD overrides the CC variable in config.guess but I had
+ # problems with it so do it this way for now.
+ CC="$LTCC"
+
+ if host_alias=`$SHELL $ac_config_guess`; then :
+ else
+ echo "$progname: cannot guess host type; you must specify one" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Restore the C compiler.
+ CC="$old_CC"
+ ;;
+ esac
+ host=`$SHELL $ac_config_sub $host_alias`
+ echo "$ac_t$host" 1>&6
+
+ # Make sure the host verified.
+ test -z "$host" && exit 1
+
+ # Check for the build system type
+ echo $ac_n "checking build system type... $ac_c" 1>&6
+
+ build_alias=$build
+ case $build_alias in
+ NONE)
+ case $nonopt in
+ NONE) build_alias=$host_alias ;;
+ *) build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ build=`$SHELL $ac_config_sub $build_alias`
+ build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+ echo "$ac_t""$build" 1>&6
+
+elif test -z "$host"; then
+ echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+else
+ host_alias=$host
+ build_alias=$host_alias
+ build=$host
+fi
+
+if test x"$host" != x"$build"; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts; host_os
+# is rewritten too, because the checks below dispatch on it rather than on host.
+case $host_os in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`; host_os=`echo $host_os | sed 's/^linux/linux-gnu/'` ;;
+esac
+
+case $host_os in
+aix3*)
+ # AIX sometimes has problems with the GCC collect2 program. For some
+ # reason, if we set the COLLECT_NAMES environment variable, the problems
+ # vanish in a puff of smoke.
+ if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+ fi
+ ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+ old_archive_cmds="$old_archive_cmds~\$RANLIB \$oldlib"
+ old_postinstall_cmds="\$RANLIB \$oldlib~$old_postinstall_cmds"
+fi
+
+# Source the script associated with the $tagname tag configuration.
+if test -n "$tagname"; then
+ . $ltmain
+else
+ # FIXME: We should use a variable here
+ # Configure for a C compiler
+ . $srcdir/ltcf-c.sh
+fi
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AR_FLAGS" && AR_FLAGS=cru
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$objext" && objext=o
+
+echo $ac_n "checking for objdir... $ac_c" 1>&6
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+ objdir=.libs
+else
+ # MS-DOS does not allow filenames that begin with a dot.
+ objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+echo "$ac_t$objdir" 1>&6
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="$2"
+
+# We assume here that the value for ac_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
+echo "$progname:678:checking for $compiler option to produce PIC" 1>&5
+
+if test -z "$ac_cv_prog_cc_pic"; then
+ echo "$ac_t"none 1>&6
+else
+ echo "$ac_t""$ac_cv_prog_cc_pic" 1>&6
+
+ # Check to make sure the pic_flag actually works.
+ echo $ac_n "checking if $compiler PIC flag $ac_cv_prog_cc_pic works... $ac_c" 1>&6
+ echo "$progname:687:checking that $compiler PIC flag $ac_cv_prog_cc_pic works." 1>&5
+ if test "X${ac_cv_prog_cc_pic_works+set}" = Xset && \
+ test "X${ac_cv_prog_cc_pic_works}" != X; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+ else
+ ac_cv_prog_cc_pic_works=yes
+ $rm conftest*
+ echo $lt_simple_compile_test_code > conftest.$ac_ext
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS $ac_cv_prog_cc_pic -DPIC"
+ if { (eval echo $progname:697: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+ # Append any warnings to the config.log.
+ cat conftest.err 1>&5
+
+ case $host_os in
+ hpux9* | hpux10* | hpux11*)
+ # On HP-UX, both CC and GCC only warn that PIC is supported... then
+ # they create non-PIC objects. So, if there were any warnings, we
+ # assume that PIC is not supported.
+ if test -s conftest.err; then
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ else
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ ;;
+ esac
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+ fi
+ # Belt *and* braces to stop my trousers falling down:
+ if test "X$ac_cv_prog_cc_pic_works" = Xno; then
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+ echo "$ac_t""$ac_cv_prog_cc_pic_works" 1>&6
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$ac_cv_prog_cc_shlib"; then
+ echo "$progname: warning: \`$CC' requires \`$ac_cv_prog_cc_shlib' to build shared libraries" 1>&2
+ if echo "$old_CC $old_CFLAGS " | egrep -e "[ ]$ac_cv_prog_cc_shlib[ ]" >/dev/null; then :
+ else
+ echo "$progname: add \`$ac_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+fi
+
+echo $ac_n "checking if $compiler static flag $ac_cv_prog_cc_static works... $ac_c" 1>&6
+echo "$progname:749: checking if $compiler static flag $ac_cv_prog_cc_static works" >&5
+if test "X${ac_cv_prog_cc_static_works+set}" = Xset && \
+ test "X${ac_cv_prog_cc_static_works}" != X; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ $rm conftest*
+ echo $lt_simple_link_test_code > conftest.$ac_ext
+ save_LDFLAGS="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $ac_cv_prog_cc_static"
+ if { (eval echo $progname:758: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+ ac_cv_prog_cc_static_works=yes
+ else
+ ac_cv_prog_cc_static_works=no
+ ac_cv_prog_cc_static=
+ fi
+ LDFLAGS="$save_LDFLAGS"
+ $rm conftest*
+fi
+# Belt *and* braces to stop my trousers falling down:
+if test "X$ac_cv_prog_cc_static_works" = Xno; then
+ ac_cv_prog_cc_static=
+fi
+echo "$ac_t""$ac_cv_prog_cc_static_works" 1>&6
+pic_flag="$ac_cv_prog_cc_pic"
+special_shlib_compile_flags="$ac_cv_prog_cc_shlib"
+wl="$ac_cv_prog_cc_wl"
+link_static_flag="$ac_cv_prog_cc_static"
+no_builtin_flag="$ac_cv_prog_cc_no_builtin"
+can_build_shared="$ac_cv_prog_cc_can_build_shared"
+
+# find the maximum length of command line arguments
+echo "$progname:780: finding the maximum length of command line arguments" 1>&5
+echo $ac_n "finding the maximum length of command line arguments... $ac_c" 1>&6
+if test "${lt_cv_sys_max_cmd_len+set}" = set; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ i=0
+ testring="ABCD"
+ # If test is not a shell built-in, we'll probably end up computing a
+ # maximum length that is only half of the actual maximum length, but
+ # we can't tell.
+ while test "X"`$CONFIG_SHELL $0 --fallback-echo "X$testring" 2>/dev/null` \
+ = "XX$testring" &&
+ new_result=`expr "X$testring" : ".*" 2>&1` &&
+ lt_cv_sys_max_cmd_len=$new_result &&
+ test $i != 18 # 1 MB should be enough
+ do
+ i=`expr $i + 1`
+ testring=$testring$testring
+ done
+ testring=
+ # add a significant safety factor because C++ compilers can tack on massive amounts
+ # of additional arguments before passing them to the linker. 1/4 should be good.
+ len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+ lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len - $len`
+fi
+echo "$progname:805: result: $lt_cv_sys_max_cmd_len" 1>&5
+echo "${ac_t}$lt_cv_sys_max_cmd_len" 1>&6
+# Quote the operand: with an unquoted empty value "test -n" always succeeds.
+if test -n "$lt_cv_sys_max_cmd_len"; then
+ max_cmd_len=$lt_cv_sys_max_cmd_len
+else
+ max_cmd_len=none
+fi
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+echo $ac_n "checking if $compiler supports -c -o file.$objext... $ac_c" 1>&6
+if test "${lt_cv_compiler_c_o+set}" = set; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ $rm -r conftest 2>/dev/null
+ mkdir conftest
+ cd conftest
+ $rm conftest*
+ echo $lt_simple_compile_test_code > conftest.$ac_ext
+ mkdir out
+ # According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+ # that will create temporary files in the current directory regardless of
+ # the output directory. Thus, making CWD read-only will cause this test
+ # to fail, enabling locking or at least warning the user not to do parallel
+ # builds.
+ chmod -w .
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -o out/conftest2.$objext"
+ echo "$progname:833: checking if $compiler supports -c -o file.$objext" >&5
+ if { (eval echo $progname:834: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$objext; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s out/conftest.err; then
+ lt_cv_compiler_c_o=no
+ else
+ lt_cv_compiler_c_o=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat out/conftest.err 1>&5
+ lt_cv_compiler_c_o=no
+ fi
+ CFLAGS="$save_CFLAGS"
+ chmod u+w .
+ $rm conftest* out/*
+ rmdir out
+ cd ..
+ rmdir conftest
+ $rm -r conftest 2>/dev/null
+fi
+compiler_c_o=$lt_cv_compiler_c_o
+echo "${ac_t}$compiler_c_o" 1>&6
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+ # do not overwrite the value of need_locks provided by the user
+ echo $ac_n "checking if we can lock with hard links... $ac_c" 1>&6
+ hard_links=yes
+ $rm conftest*
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ touch conftest.a
+ ln conftest.a conftest.b 2>&5 || hard_links=no
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ echo "$ac_t$hard_links" 1>&6
+ $rm conftest*
+ if test "$hard_links" = no; then
+ echo "*** WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2
+ need_locks=warn
+ fi
+else
+ need_locks=no
+fi
+
+if test "$with_gcc" = yes; then
+ # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+ echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions ... $ac_c" 1>&6
+ $rm conftest*
+ echo $lt_simple_compile_test_code > conftest.$ac_ext
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.$ac_ext"
+ echo "$progname:887: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+ if { (eval echo $progname:888: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_rtti_exceptions=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_rtti_exceptions=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ compiler_rtti_exceptions=no
+ echo "$ac_t"no 1>&6
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+
+ if test "$compiler_rtti_exceptions" = "yes"; then
+ no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+ else
+ no_builtin_flag=' -fno-builtin'
+ fi
+
+fi
+
+# See if the linker supports building shared libraries.
+echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
+
+echo "$ac_t$ld_shlibs" 1>&6
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes.
+echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+ test -n "$runpath_var"; then
+
+ # We can hardcode nonexistent directories.
+ if test "$hardcode_direct" != no &&
+ # If the only mechanism to avoid hardcoding is shlibpath_var, we
+ # have to relink, otherwise we might link with an installed library
+ # when we should be linking with a yet-to-be-installed one
+ ## test "$hardcode_shlibpath_var" != no &&
+ test "$hardcode_minus_L" != no; then
+ # Linking always hardcodes the temporary library directory.
+ hardcode_action=relink
+ else
+ # We can link without hardcoding, and we can hardcode nonexisting dirs.
+ hardcode_action=immediate
+ fi
+else
+ # We cannot hardcode anything, or else we can only hardcode existing
+ # directories.
+ hardcode_action=unsupported
+fi
+echo "$ac_t$hardcode_action" 1>&6
+
+echo $ac_n "checking whether stripping libraries is possible... $ac_c" 1>&6
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+ test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+ test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+ echo "${ac_t}yes" 1>&6
+else
+ echo "${ac_t}no" 1>&6
+fi
+
+case $reload_flag in
+"" | " "*) ;;
+*) reload_flag=" $reload_flag" ;;
+esac
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
+case $host_os in
+aix3*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix $libname.a'
+ shlibpath_var=LIBPATH
+
+ # AIX 3 has no versioning support, so we append a major version to the name.
+ soname_spec='${libname}${release}.so$major'
+ ;;
+
+aix4* | aix5*)
+ if test "$host_cpu" = ia64; then
+ # AIX 5 supports IA64
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ else
+ # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+ # soname into executable. Probably we can add versioning support to
+ # collect2, so additional links can be useful in future.
+ # We preserve .a as extension for shared libraries though AIX4.2
+ # and later linker supports .so
+ if test "$aix_use_runtimelinking" = yes; then
+ # If using run time linking (on AIX 4.2 or later) use lib<name>.so instead of
+ # lib<name>.a to let people know that these are not typical AIX shared libraries.
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ else
+ # We preserve .a as extension for shared libraries though AIX4.2
+ # and later when we are not doing run time linking.
+ library_names_spec='${libname}${release}.a $libname.a'
+ soname_spec='${libname}${release}.so$major.o'
+ fi
+ # If we're using GNU nm, then we don't want the "-C" option.
+ # -C means demangle to AIX nm, but means don't demangle with GNU nm
+ if $NM -V 2>&1 | egrep '(GNU)' > /dev/null; then
+ export_symbols_cmds='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$2 == "T") || (\$2 == "D") || (\$2 == "B")) && (substr(\$3,1,1) != ".")) { print \$3 } }'\'' | sort -u > $export_symbols'
+ else
+ export_symbols_cmds='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$2 == "T") || (\$2 == "D") || (\$2 == "B")) && (substr(\$3,1,1) != ".")) { print \$3 } }'\'' | sort -u > $export_symbols'
+ fi
+ shlibpath_var=LIBPATH
+ deplibs_check_method=pass_all
+ case $host_os in
+ aix4 | aix4.[01] | aix4.[01].*)
+ if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+ echo ' yes '
+ echo '#endif'; } | ${CC} -E - | grep yes > /dev/null; then
+ :
+ else
+ # With GCC up to 2.95.x, collect2 would create an import file
+ # for dependence libraries. The import file would start with
+ # the line `#! .'. This would cause the generated library to
+ # depend on `.', always an invalid library. This was fixed in
+ # development snapshots of GCC prior to 3.0.
+ can_build_shared=no
+ fi
+ ;;
+ esac
+ fi
+ ;;
+
+amigaos*)
+ library_names_spec='$libname.ixlibrary $libname.a'
+ # Create ${libname}_ixlibrary.a entries in /sys/libs.
+ finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+ ;;
+
+beos*)
+ library_names_spec='${libname}.so'
+ dynamic_linker="$host_os ld.so"
+ shlibpath_var=LIBRARY_PATH
+ lt_cv_dlopen="load_add_on"
+ lt_cv_dlopen_libs=
+ lt_cv_dlopen_self=yes
+ ;;
+
+bsdi4*)
+ version_type=linux
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+ sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+ export_dynamic_flag_spec=-rdynamic
+ # the default ld.so.conf also contains /usr/contrib/lib and
+ # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+ # libtool to hard-code these into programs
+ ;;
+
+cygwin* | mingw* | pw32*)
+ version_type=windows
+ need_version=no
+ need_lib_prefix=no
+ case $with_gcc,$host_os in
+ yes,cygwin*) # DLL soname: "lib" prefix becomes "cyg", dots in the release become dashes
+ library_names_spec='$libname.dll.a'
+ soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+ postinstall_cmds='dlpath=`bash 2>&1 -c '\''. $dir/${file}i; echo \$dlname'\''`~
+ dldir=$destdir/`dirname \$dlpath`~
+ test -d \$dldir || mkdir -p \$dldir~
+ $install_prog .libs/$dlname \$dldir/$dlname'
+ postuninstall_cmds='dldll=`bash 2>&1 -c '\''. $file; echo \$dlname'\''`~
+ dlpath=$dir/\$dldll; $rm \$dlpath'
+ ;;
+ yes,mingw*)
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+ sys_lib_search_path_spec=`$CC -print-search-dirs | grep "^libraries:" | sed -e "s/^libraries://" -e "s/;/ /g"`
+ ;;
+ yes,pw32*)
+ library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+;;
+ *)
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib'
+ ;;
+ esac
+ dynamic_linker='Win32 ld.exe'
+ # FIXME: first we should search . and the directory the executable is in
+ shlibpath_var=PATH
+ lt_cv_dlopen="LoadLibrary"
+ lt_cv_dlopen_libs=
+ ;;
+
+darwin* | rhapsody*) # suffix is "so" when .$module = .yes, otherwise "dylib"
+ dynamic_linker="$host_os dyld"
+ version_type=darwin
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}${versuffix}.`test .$module = .yes && echo so || echo dylib` ${libname}${release}${major}.`test .$module = .yes && echo so || echo dylib` ${libname}.`test .$module = .yes && echo so || echo dylib`'
+ soname_spec='${libname}${release}${major}.`test .$module = .yes && echo so || echo dylib`'
+ shlibpath_overrides_runpath=yes
+ shlibpath_var=DYLD_LIBRARY_PATH
+ ;;
+
+freebsd1*)
+ dynamic_linker=no
+ ;;
+
+freebsd*)
+ objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+ version_type=freebsd-$objformat
+ case $version_type in
+ freebsd-elf*)
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ need_version=no
+ need_lc=no
+ need_lib_prefix=no
+ ;;
+ freebsd-*)
+ library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+ need_version=yes
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY_PATH
+ case $host_os in
+ freebsd2*)
+ shlibpath_overrides_runpath=yes
+ ;;
+ *)
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ ;;
+ esac
+ ;;
+
+gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ hardcode_into_libs=yes
+ ;;
+
+hpux9* | hpux10* | hpux11*)
+ # Give a soname corresponding to the major version so that dld.sl refuses to
+ # link against other versions.
+ dynamic_linker="$host_os dld.sl"
+ version_type=sunos
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_var=SHLIB_PATH
+ shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+ library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+ soname_spec='${libname}${release}.sl$major'
+ # HP-UX runs *really* slowly unless shared libraries are mode 555.
+ postinstall_cmds='chmod 555 $lib'
+ ;;
+
+irix5* | irix6*)
+ version_type=irix
+ need_lib_prefix=no
+ need_version=no
+ soname_spec='${libname}${release}.so$major'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so $libname.so'
+ case $host_os in
+ irix5*)
+ libsuff= shlibsuff=
+ ;;
+ *)
+ case $LD in # libtool.m4 will add one of these switches to LD
+ *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+ *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+ *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+ *) libsuff= shlibsuff= libmagic=never-match;;
+ esac
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+ sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+ ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+ dynamic_linker=no
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ # This implies no fast_install, which is unacceptable.
+ # Some rework will be needed to allow for fast_install
+ # before this can be enabled.
+ hardcode_into_libs=yes
+
+ # We used to test for /lib/ld.so.1 and disable shared libraries on
+ # powerpc, because MkLinux only supported shared libraries with the
+ # GNU dynamic linker. Since this was broken with cross compilers,
+ # most powerpc-linux boxes support dynamic linking these days and
+ # people can always --disable-shared, the test was removed, and we
+ # assume the GNU/Linux dynamic linker is in use.
+ dynamic_linker='GNU/Linux ld.so'
+ ;;
+
+netbsd*)
+ need_lib_prefix=no
+ need_version=no
+ version_type=sunos
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ dynamic_linker='NetBSD (a.out) ld.so'
+ else
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ dynamic_linker='NetBSD ld.elf_so'
+ fi
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ ;;
+
+newsos6)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ ;;
+
+openbsd*)
+ version_type=sunos
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ need_version=no
+ fi
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+os2*)
+ libname_spec='$name'
+ need_lib_prefix=no
+ library_names_spec='$libname.dll $libname.a'
+ dynamic_linker='OS/2 ld.exe'
+ shlibpath_var=LIBPATH
+ ;;
+
+osf3* | osf4* | osf5*)
+ version_type=osf
+ need_version=no
+ soname_spec='${libname}${release}.so'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+ sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+ ;;
+
+sco3.2v5*)
+ version_type=osf
+ soname_spec='${libname}${release}.so$major'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+solaris*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ # ldd complains unless libraries are executable
+ postinstall_cmds='chmod +x $lib'
+ ;;
+
+sunos4*)
+ version_type=sunos
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ fi
+ need_version=yes
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ case $host_vendor in
+ motorola)
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+ ;;
+ esac
+ ;;
+
+uts4*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+dgux*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+sysv4*MP*)
+ if test -d /usr/nec ;then
+ version_type=linux
+ library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+ soname_spec='$libname.so.$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ fi
+ ;;
+
+*)
+ dynamic_linker=no
+ ;;
+esac
+echo "$ac_t$dynamic_linker" 1>&6
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix. What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+ symcode='[BCDT]'
+ ;;
+cygwin* | mingw* | pw32*)
+ symcode='[ABCDGISTW]'
+ ;;
+hpux*) # Its linker distinguishes data from code symbols
+ global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+ ;;
+irix*)
+ symcode='[BCDEGRST]'
+ ;;
+solaris* | sysv5*)
+ symcode='[BDT]'
+ ;;
+sysv4)
+ symcode='[DFNSTU]'
+ ;;
+esac
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $host_os in
+mingw*)
+ opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+ ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+ symcode='[ABCDGISTW]'
+fi
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+ # Write the raw and C identifiers.
+ global_symbol_pipe="sed -n -e 's/^.*[ ]\($symcode$symcode*\)[ ][ ]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+ # Check to see that the pipe works correctly.
+ pipe_works=no
+ $rm conftest*
+ cat > conftest.$ac_ext <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+ echo "$progname:1430: checking if global_symbol_pipe works" >&5
+ if { (eval echo $progname:1431: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.$objext; then
+ # Now try to grab the symbols.
+ nlist=conftest.nm
+ if { echo "$progname:1434: eval \"$NM conftest.$objext | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.$objext | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
+
+ # Try sorting and uniquifying the output.
+ if sort "$nlist" | uniq > "$nlist"T; then
+ mv -f "$nlist"T "$nlist"
+ else
+ rm -f "$nlist"T
+ fi
+
+ # Make sure that we snagged all the symbols we need.
+ if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+ if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+ cat <<EOF > conftest.$ac_ext
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+ # Now generate the symbol file.
+ eval "$global_symbol_to_cdecl"' < "$nlist" >> conftest.$ac_ext'
+
+ cat <<EOF >> conftest.$ac_ext
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{
+EOF
+ sed "s/^$symcode$symcode* \(.*\) \(.*\)$/ {\"\2\", (lt_ptr_t) \&\2},/" < "$nlist" >> conftest.$ac_ext
+ cat <<\EOF >> conftest.$ac_ext
+ {0, (lt_ptr_t) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+ # Now try linking the two files.  The object compiled earlier is renamed
+ # and named in LIBS, and $no_builtin_flag is appended to CFLAGS.
+ # Both variables are changed for this link test only, so both are put
+ # back afterwards; otherwise $no_builtin_flag would pile up in CFLAGS on
+ # every pass of the loop and leak into the checks that follow.
+ mv conftest.$objext conftstm.$objext
+ save_LIBS="$LIBS"
+ save_CFLAGS="$CFLAGS"
+ LIBS="conftstm.$objext"
+ CFLAGS="$CFLAGS$no_builtin_flag"
+ if { (eval echo $progname:1486: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+   pipe_works=yes
+ else
+   # Log the program that failed to link.
+   echo "$progname: failed program was:" >&5
+   cat conftest.$ac_ext >&5
+ fi
+ LIBS="$save_LIBS"
+ CFLAGS="$save_CFLAGS"
+ else
+ echo "cannot find nm_test_func in $nlist" >&5
+ fi
+ else
+ echo "cannot find nm_test_var in $nlist" >&5
+ fi
+ else
+ echo "cannot run $global_symbol_pipe" >&5
+ fi
+ else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ fi
+ $rm conftest* conftst*
+
+ # Do not use the global_symbol_pipe unless it works.
+ if test "$pipe_works" = yes; then
+ break
+ else
+ global_symbol_pipe=
+ fi
+done
+if test "$pipe_works" = yes; then
+ echo "${ac_t}ok" 1>&6
+else
+ echo "${ac_t}failed" 1>&6
+fi
+
+if test -z "$global_symbol_pipe"; then
+ global_symbol_to_cdecl=
+fi
+
+# Report the final consequences.
+echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
+
+# Only try to build win32 dlls if AC_LIBTOOL_WIN32_DLL was used in
+# configure.in, otherwise build static only libraries.
+case $host_os in
+cygwin* | mingw* | pw32* | os2*)
+ if test x$can_build_shared = xyes; then
+ test x$enable_win32_dll = xno && can_build_shared=no
+ echo "checking if package supports dlls... $can_build_shared" 1>&6
+ fi
+;;
+esac
+
+echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
+test "$can_build_shared" = "no" && enable_shared=no
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC.
+case $host_os in
+aix3*)
+ test "$enable_shared" = yes && enable_static=no
+ if test -n "$RANLIB"; then
+ archive_cmds="$archive_cmds~\$RANLIB \$lib"
+ postinstall_cmds='$RANLIB $lib'
+ fi
+ ;;
+
+aix4*)
+ test "$enable_shared" = yes && enable_static=no
+ ;;
+esac
+
+echo "$ac_t$enable_shared" 1>&6
+
+# Make sure either enable_shared or enable_static is yes.
+test "$enable_shared" = yes || enable_static=yes
+
+echo "checking whether to build static libraries... $enable_static" 1>&6
+
+if test "$hardcode_action" = relink; then
+ # Fast installation is not supported
+ enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+ test "$enable_shared" = no; then
+ # Fast installation is not necessary
+ enable_fast_install=needless
+fi
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$with_gcc" = yes; then
+ variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+# Check whether we must set pic_mode to default
+test -z "$pic_flag" && pic_mode=default
+
+if test "x$enable_dlopen" != xyes; then
+ enable_dlopen=unknown
+ enable_dlopen_self=unknown
+ enable_dlopen_self_static=unknown
+else
+if test "X${lt_cv_dlopen+set}" != Xset; then
+ lt_cv_dlopen=no lt_cv_dlopen_libs=
+echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6
+echo "$progname:1590: checking for dlopen in -ldl" >&5
+if test "X${ac_cv_lib_dl_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldl $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1597 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:1610: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dl_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen""... $ac_c" 1>&6
+echo "$progname:1629: checking for dlopen" >&5
+if test "X${ac_cv_func_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 1634 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char dlopen(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_dlopen) || defined (__stub___dlopen)
+choke me
+#else
+dlopen();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:1659: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_dlopen=no
+fi
+rm -f conftest*
+fi
+if test "X$ac_cv_func_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen in -lsvld""... $ac_c" 1>&6
+echo "$progname:1676: checking for dlopen in -lsvld" >&5
+if test "X${ac_cv_lib_svld_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-lsvld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1683 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:1696: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_svld_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_svld_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_svld_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6
+echo "$progname:1715: checking for dld_link in -ldld" >&5
+if test "X${ac_cv_lib_dld_dld_link+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1722 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dld_link();
+
+int main() {
+dld_link()
+; return 0; }
+EOF
+if { (eval echo $progname:1735: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_dld_link" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load""... $ac_c" 1>&6
+echo "$progname:1754: checking for shl_load" >&5
+if test "X${ac_cv_func_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 1759 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char shl_load(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_shl_load) || defined (__stub___shl_load)
+choke me
+#else
+shl_load();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:1784: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_shl_load=no
+fi
+rm -f conftest*
+fi
+
+if test "X$ac_cv_func_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6
+echo "$progname:1802: checking for shl_load in -ldld" >&5
+if test "X${ac_cv_lib_dld_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1809 "ltconfig"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+shl_load()
+; return 0; }
+EOF
+if { (eval echo $progname:1823: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+fi
+
+fi
+
+ if test "x$lt_cv_dlopen" != xno; then
+ enable_dlopen=yes
+ else
+ enable_dlopen=no
+ fi
+
+ case $lt_cv_dlopen in
+ dlopen)
+for ac_hdr in dlfcn.h; do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "$progname:1870: checking for $ac_hdr" >&5
+if eval "test \"`echo 'X$''{'ac_cv_header_$ac_safe'+set}'`\" = Xset"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 1875 "ltconfig"
+#include <$ac_hdr>
+int fnord = 0;
+int main () { return(0); }
+EOF
+ac_try="$ac_compile >/dev/null 2>conftest.out"
+{ (eval echo $progname:1881: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=yes"
+else
+ echo "$ac_err" >&5
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+ echo "$ac_t""yes" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+done
+
+ if test "x$ac_cv_header_dlfcn_h" = xyes; then
+ CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+ fi
+ eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+ LIBS="$lt_cv_dlopen_libs $LIBS"
+
+ echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:1909: checking whether a program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self=cross
+ else
+ cat > conftest.$ac_ext <<EOF
+#line 1917 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+void fnord() { int i=42; }
+int main() {
+ void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:1964: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self" 1>&6
+
+ if test "$lt_cv_dlopen_self" = yes; then
+ LDFLAGS="$LDFLAGS $link_static_flag"
+ echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:1983: checking whether a statically linked program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self_static+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self_static=cross
+ else
+ cat > conftest.$ac_ext <<EOF
+#line 1991 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+void fnord() { int i=42; }
+int main() {
+ void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2038: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self_static=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self_static=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6
+fi
+ ;;
+ esac
+
+ # Publish the two self-dlopen probe results.  Anything other than a plain
+ # yes or no (for instance "cross", or no result at all) becomes "unknown".
+ enable_dlopen_self=unknown
+ enable_dlopen_self_static=unknown
+
+ case $lt_cv_dlopen_self in
+ yes | no)
+   enable_dlopen_self=$lt_cv_dlopen_self
+   ;;
+ esac
+
+ case $lt_cv_dlopen_self_static in
+ yes | no)
+   enable_dlopen_self_static=$lt_cv_dlopen_self_static
+   ;;
+ esac
+fi
+
+# Copy echo and quote the copy, instead of the original, because it is
+# used later.
+ltecho="$echo"
+if test "X$ltecho" = "X$CONFIG_SHELL $0 --fallback-echo"; then
+ ltecho="$CONFIG_SHELL \$0 --fallback-echo"
+fi
+LTSHELL="$SHELL"
+
+LTCONFIG_VERSION="$VERSION"
+
+# Only quote variables if we're using ltmain.sh.
+case $ltmain in
+*.sh)
+ # Now quote all the things that may contain metacharacters.
+ for var in ltecho old_AR old_AR_FLAGS old_CC old_LTCC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC_CMD old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd \
+ AR AR_FLAGS CC LTCC LD LN_S NM LTSHELL LTCONFIG_VERSION \
+ reload_flag reload_cmds wl \
+ pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+ thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+ library_names_spec soname_spec \
+ RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+ old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+ postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+ predep_objects postdep_objects predeps postdeps compiler_lib_search_path \
+ old_striplib striplib file_magic_cmd export_symbols_cmds \
+ deplibs_check_method allow_undefined_flag no_undefined_flag \
+ finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+ hardcode_libdir_flag_spec hardcode_libdir_separator \
+ sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+ compiler_c_o need_locks exclude_expsyms include_expsyms; do
+
+ case $var in
+ reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+ old_postinstall_cmds | old_postuninstall_cmds | \
+ export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+ extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+ postinstall_cmds | postuninstall_cmds | \
+ finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+ # Double-quote double-evaled strings.
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ *)
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ esac
+ done
+
+ case $ltecho in
+ *'\$0 --fallback-echo"')
+ ltecho=`$echo "X$ltecho" | $Xsed -e 's/\\\\\\\$0 --fallback-echo"$/$0 --fallback-echo"/'`
+ ;;
+ esac
+
+ if test -z "$tagname"; then
+ trap "$rm \"$ofile\"; exit 1" 1 2 15
+ echo "creating $ofile"
+ $rm "$ofile"
+ cat <<EOF > "$ofile"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="sed -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+# The names of the tagged configurations supported by this script.
+available_tags=
+
+### BEGIN LIBTOOL CONFIG
+EOF
+ else
+ echo "appending configuration tag \"$tagname\" to $ofile"
+ echo "### BEGIN LIBTOOL TAG CONFIG: $tagname" >> "$ofile"
+ fi
+ cfgfile="$ofile"
+ ;;
+
+*)
+ # Double-quote the variables that need it (for aesthetics).
+ # Each listed variable is reassigned to its own value wrapped in literal
+ # double quotes.  The eval string must contain \$$var (the value of the
+ # variable named by $var); a bare \$var would assign the variable's name
+ # rather than its contents.
+ for var in old_AR old_AR_FLAGS old_CC old_LTCC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC_CMD old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd; do
+   eval "$var=\\\"\$$var\\\""
+ done
+
+ # Just create a config file.
+ # Without a tag name the file is created afresh with a short header (and
+ # removed again on signals 1, 2 and 15); with a tag name only a tag
+ # marker is appended.  Either way the output goes to $cfgfile.
+ cfgfile="$ofile.cfg"
+ if test -z "$tagname"; then
+ trap "$rm \"$cfgfile\"; exit 1" 1 2 15
+ echo "creating $cfgfile"
+ $rm "$cfgfile"
+ cat <<EOF > "$cfgfile"
+# `$echo "$cfgfile" | sed 's%^.*/%%'` - Libtool configuration file.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+
+### BEGIN LIBTOOL CONFIG
+EOF
+ else
+ echo "appending to $cfgfile"
+ # The marker must land in the same file as the settings appended next.
+ echo "### BEGIN LIBTOOL TAG CONFIG: $tagname" >> "$cfgfile"
+ fi
+ ;;
+esac
+
+cat <<EOF >> "$cfgfile"
+# Libtool was configured as follows, on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# AR=$old_AR AR_FLAGS=$old_AR_FLAGS LTCC=$old_LTCC CC=$old_CC \\
+# CFLAGS=$old_CFLAGS CPPFLAGS=$old_CPPFLAGS \\
+# MAGIC_CMD=$old_MAGIC_CMD LD=$old_LD LDFLAGS=$old_LDFLAGS LIBS=$old_LIBS \\
+# LN_S=$old_LN_S NM=$old_NM RANLIB=$old_RANLIB STRIP=$old_STRIP \\
+# AS=$old_AS DLLTOOL=$old_DLLTOOL OBJDUMP=$old_OBJDUMP \\
+# objext=$old_OBJEXT exeext=$old_EXEEXT reload_flag=$old_reload_flag \\
+# deplibs_check_method=$old_deplibs_check_method \\
+# file_magic_cmd=$old_file_magic_cmd \\
+# $0$ltconfig_args
+#
+# Compiler and other test output produced by $progname, useful for
+# debugging $progname, is in ./config.log if it exists.
+
+# The version of $progname that generated this script.
+LTCONFIG_VERSION=$LTCONFIG_VERSION
+
+# Shell to use when invoking shell scripts.
+SHELL=$LTSHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$need_lc
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$ltecho
+
+# The archiver.
+AR=$AR
+AR_FLAGS=$AR_FLAGS
+
+# A C compiler.
+LTCC=$LTCC
+
+# A language-specific compiler.
+CC=$CC
+
+# Is the compiler the GNU C compiler?
+with_gcc=$with_gcc
+
+# The linker used to build libraries.
+LD=$LD
+
+# Whether we need hard or soft links.
+LN_S=$LN_S
+
+# A BSD-compatible nm program.
+NM=$NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC_CMD=$MAGIC_CMD
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$reload_flag
+reload_cmds=$reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$wl
+
+# Object file suffix (normally "o").
+objext="$objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$pic_flag
+pic_mode=$pic_mode
+
+# What is the maximum length of a command?
+max_cmd_len=$max_cmd_len
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$compiler_c_o
+
+# Must we lock files when doing compilation ?
+need_locks=$need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$libname_spec
+
+# List of archive names. First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$RANLIB
+old_archive_cmds=$old_archive_cmds
+old_postinstall_cmds=$old_postinstall_cmds
+old_postuninstall_cmds=$old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$archive_cmds
+archive_expsym_cmds=$archive_expsym_cmds
+postinstall_cmds=$postinstall_cmds
+postuninstall_cmds=$postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$old_striplib
+striplib=$striplib
+
+# Dependencies to place before the objects being linked to create a
+# shared library.
+predep_objects=$predep_objects
+
+# Dependencies to place after the objects being linked to create a
+# shared library.
+postdep_objects=$postdep_objects
+
+# Dependencies to place before the objects being linked to create a
+# shared library.
+predeps=$predeps
+
+# Dependencies to place after the objects being linked to create a
+# shared library.
+postdeps=$postdeps
+
+# The library search path used internally by the compiler when linking
+# a shared library.
+compiler_lib_search_path=$compiler_lib_search_path
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$global_symbol_to_cdecl
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Variables whose values should be saved in libtool wrapper scripts and
+# restored at relink time.
+variables_saved_for_relink="$variables_saved_for_relink"
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$include_expsyms
+
+EOF
+
+# Terminate the configuration section in "$ofile": a tagged configuration
+# gets an end marker naming the tag, the untagged one gets the plain marker.
+if test -n "$tagname"; then
+ end_marker="### END LIBTOOL TAG CONFIG: $tagname"
+else
+ end_marker='### END LIBTOOL CONFIG'
+fi
+echo "$end_marker" >> "$ofile"
+
+case $ltmain in
+*.sh)
+ echo >> "$ofile"
+ if test -z "$tagname"; then
+ case $host_os in
+ aix3*)
+ cat <<\EOF >> "$ofile"
+
+# AIX sometimes has problems with the GCC collect2 program. For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+fi
+EOF
+ ;;
+ esac
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+ cat <<'EOF' >> "$ofile"
+ # This is a source program that is used to create dlls on Windows
+ # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# # ifdef __CYGWIN32__
+# # define __CYGWIN__ __CYGWIN32__
+# # endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+# __hDllInstance_base = hInst;
+# return TRUE;
+# }
+# /* ltdll.c ends here */
+ # This is a source program that is used to create import libraries
+ # on Windows for dlls which lack them. Don't remove nor modify the
+ # starting and closing comments
+# /* impgen.c starts here */
+# /* Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+# This file is part of GNU libtool.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# */
+#
+# #include <stdio.h> /* for printf() */
+# #include <unistd.h> /* for open(), lseek(), read() */
+# #include <fcntl.h> /* for O_RDONLY, O_BINARY */
+# #include <string.h> /* for strdup() */
+#
+# /* O_BINARY isn't required (or even defined sometimes) under Unix */
+# #ifndef O_BINARY
+# #define O_BINARY 0
+# #endif
+#
+# static unsigned int
+# pe_get16 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[2];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 2);
+# return b[0] + (b[1]<<8);
+# }
+#
+# static unsigned int
+# pe_get32 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[4];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 4);
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# static unsigned int
+# pe_as32 (ptr)
+# void *ptr;
+# {
+# unsigned char *b = ptr;
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# int
+# main (argc, argv)
+# int argc;
+# char *argv[];
+# {
+# int dll;
+# unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+# unsigned long export_rva, export_size, nsections, secptr, expptr;
+# unsigned long name_rvas, nexp;
+# unsigned char *expdata, *erva;
+# char *filename, *dll_name;
+#
+# filename = argv[1];
+#
+# dll = open(filename, O_RDONLY|O_BINARY);
+# if (dll < 1)
+# return 1;
+#
+# dll_name = filename;
+#
+# for (i=0; filename[i]; i++)
+# if (filename[i] == '/' || filename[i] == '\\' || filename[i] == ':')
+# dll_name = filename + i +1;
+#
+# pe_header_offset = pe_get32 (dll, 0x3c);
+# opthdr_ofs = pe_header_offset + 4 + 20;
+# num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+# if (num_entries < 1) /* no exports */
+# return 1;
+#
+# export_rva = pe_get32 (dll, opthdr_ofs + 96);
+# export_size = pe_get32 (dll, opthdr_ofs + 100);
+# nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+# secptr = (pe_header_offset + 4 + 20 +
+# pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+# expptr = 0;
+# for (i = 0; i < nsections; i++)
+# {
+# char sname[8];
+# unsigned long secptr1 = secptr + 40 * i;
+# unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+# unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+# unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+# lseek(dll, secptr1, SEEK_SET);
+# read(dll, sname, 8);
+# if (vaddr <= export_rva && vaddr+vsize > export_rva)
+# {
+# expptr = fptr + (export_rva - vaddr);
+# if (export_rva + export_size > vaddr + vsize)
+# export_size = vsize - (export_rva - vaddr);
+# break;
+# }
+# }
+#
+# expdata = (unsigned char*)malloc(export_size);
+# lseek (dll, expptr, SEEK_SET);
+# read (dll, expdata, export_size);
+# erva = expdata - export_rva;
+#
+# nexp = pe_as32 (expdata+24);
+# name_rvas = pe_as32 (expdata+32);
+#
+# printf ("EXPORTS\n");
+# for (i = 0; i<nexp; i++)
+# {
+# unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+# printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+# }
+#
+# return 0;
+# }
+# /* impgen.c ends here */
+
+EOF
+ ;;
+ esac
+
+
+ # Append the ltmain.sh script.
+ sed '$q' "$ltmain" >> "$ofile" || (rm -f "$ofile"; exit 1)
+ # We use sed instead of cat because bash on DJGPP gets confused if
+ # it finds mixed CR/LF and LF-only lines. Since sed operates in
+ # text mode, it properly converts lines to CR/LF. This bash problem
+ # is reportedly fixed, but why not run on old versions too?
+
+ chmod +x "$ofile"
+ fi
+ ;;
+
+*)
+ # Compile the libtool program.
+ echo "FIXME: would compile $ltmain"
+ ;;
+esac
+
+# Update the list of available tags.
+# Only done for a tagged configuration: the new tag name is appended to the
+# available_tags= line that already exists in "$ofile".
+if test -n "$tagname"; then
+
+ # Extract the list of available tagged configurations from "$ofile".
+ # Note that this assumes the entire list is on one line.
+ available_tags=`grep "^available_tags=" "$ofile" | sed -e 's/available_tags=\(.*$\)/\1/' -e 's/\"//g'`
+
+ # Append the new tag name to the list of available tags.
+ available_tags="$available_tags $tagname"
+
+ # Now substitute the updated list of available tags. The result is
+ # written to "$ofile.new" first and only moved into place when sed
+ # succeeded, so a failure never leaves a truncated "$ofile" behind.
+ # File names are quoted so that paths containing blanks keep working.
+ if sed -e "s/^available_tags=.*\$/available_tags=\"$available_tags\"/" "$ofile" > "${ofile}.new"; then
+ mv "${ofile}.new" "$ofile"
+ chmod +x "$ofile"
+ else
+ rm -f "${ofile}.new"
+ # Diagnostics belong on stderr, not in the normal output stream.
+ echo "$progname: unable to update list of available tagged configurations." 1>&2
+ exit 1
+ fi
+fi
+
+# Don't cache tagged configuration!
+test -n "$cache_file" && test -z "$tagname" || exit 0
+
+# AC_CACHE_SAVE
+trap '' 1 2 15
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >> confcache
+# Replace the cache file with the freshly written confcache, but only when
+# the two differ and the cache file is writable; identical contents are
+# left untouched. The cache file name is quoted so that a path containing
+# blanks or glob characters is passed through intact.
+if cmp -s "$cache_file" confcache; then
+ :
+else
+ if test -w "$cache_file"; then
+ echo "updating cache $cache_file"
+ cat confcache > "$cache_file"
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+# The temporary copy is no longer needed in either case.
+rm -f confcache
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
--- /dev/null
+# ltmain.sh - Provide generalized library-building support services.
+# NOTE: Changing this file will not affect anything until you rerun ltconfig.
+#
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001
+# Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Check that we have a working $echo.
+# "Working" means that $echo prints a backslash sequence such as '\t'
+# verbatim. If it does not, the script re-executes itself under $SHELL
+# with --no-reexec as first argument; that flag makes the second run skip
+# this check, so the re-exec happens at most once.
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ # (the --fallback-echo request itself is served by the next block).
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell, and then maybe $echo will work.
+ # ${1+"$@"} passes the original arguments on unchanged, and expands to
+ # nothing at all when there are none.
+ exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# The name of this program.
+progname=`$echo "$0" | sed 's%^.*/%%'`
+modename="$progname"
+
+# Constants.
+PROGRAM=ltmain.sh
+PACKAGE=libtool
+VERSION=1.4a
+TIMESTAMP=" (1.641.2.255 2001/05/22 10:39:30)"
+
+default_mode=
+help="Try \`$progname --help' for more information."
+magic="%%%MAGIC variable%%%"
+mkdir="mkdir"
+mv="mv -f"
+rm="rm -f"
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
+SP2NL='tr \040 \012'
+NL2SP='tr \015\012 \040\040'
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+# We save the old values to restore during execute mode.
+if test "${LC_ALL+set}" = set; then
+ save_LC_ALL="$LC_ALL"; LC_ALL=C; export LC_ALL
+fi
+if test "${LANG+set}" = set; then
+ save_LANG="$LANG"; LANG=C; export LANG
+fi
+
+# The ltconfig version recorded in the configuration must be the same as
+# the VERSION of this script; anything else is a fatal mismatch.
+if test "$LTCONFIG_VERSION" = "$VERSION"; then
+ :
+else
+ echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+# At least one of build_libtool_libs / build_old_libs has to be "yes",
+# otherwise there is no kind of library this script could build.
+if test "$build_libtool_libs" = yes || test "$build_old_libs" = yes; then
+ :
+else
+ echo "$modename: not configured to build any kind of library" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+# Global variables.
+mode=$default_mode
+nonopt=
+prev=
+prevopt=
+run=
+show="$echo"
+show_help=
+execute_dlfiles=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
+
+# Parse our command line options once, thoroughly.
+while test $# -gt 0
+do
+ arg="$1"
+ shift
+
+ case $arg in
+ -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case $prev in
+ execute_dlfiles)
+ execute_dlfiles="$execute_dlfiles $arg"
+ ;;
+ tag)
+ tagname="$arg"
+
+ # Check whether tagname contains only valid characters
+ case $tagname in
+ *[!-_A-Za-z0-9,/]*)
+ echo "$progname: invalid tag name: $tagname" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case $tagname in
+ CC)
+ # Don't test for the "default" C tag, as we know, it's there, but
+ # not specially marked.
+ ;;
+ *)
+ if grep "^### BEGIN LIBTOOL TAG CONFIG: $tagname$" < "$0" > /dev/null; then
+ taglist="$taglist $tagname"
+ # Evaluate the configuration.
+ eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$tagname'$/,/^### END LIBTOOL TAG CONFIG: '$tagname'$/p' < $0`"
+ else
+ echo "$progname: ignoring unknown tag $tagname" 1>&2
+ fi
+ ;;
+ esac
+ ;;
+ *)
+ eval "$prev=\$arg"
+ ;;
+ esac
+
+ prev=
+ prevopt=
+ continue
+ fi
+
+ # Have we seen a non-optional argument yet?
+ case $arg in
+ --help)
+ show_help=yes
+ ;;
+
+ --version)
+ echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"
+ exit 0
+ ;;
+
+ --config)
+ sed -n -e '/^### BEGIN LIBTOOL CONFIG/,/^### END LIBTOOL CONFIG/p' < "$0"
+ # Now print the configurations for the tags.
+ for tagname in $taglist; do
+ sed -n -e "/^### BEGIN LIBTOOL TAG CONFIG: $tagname$/,/^### END LIBTOOL TAG CONFIG: $tagname$/p" < "$0"
+ done
+ exit 0
+ ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --dry-run | -n)
+ run=:
+ ;;
+
+ --features)
+ echo "host: $host"
+ if test "$build_libtool_libs" = yes; then
+ echo "enable shared libraries"
+ else
+ echo "disable shared libraries"
+ fi
+ if test "$build_old_libs" = yes; then
+ echo "enable static libraries"
+ else
+ echo "disable static libraries"
+ fi
+ exit 0
+ ;;
+
+ --finish) mode="finish" ;;
+
+ --mode) prevopt="--mode" prev=mode ;;
+ --mode=*) mode="$optarg" ;;
+
+ --quiet | --silent)
+ show=:
+ ;;
+
+ # --tag NAME: the tag name is the next argument; prevopt is recorded so
+ # that a missing argument can be reported after the loop.
+ --tag) prevopt="--tag" prev=tag ;;
+ # --tag=NAME: rebuild the argument list as "tag NAME rest...", then shift
+ # the dummy word away so NAME becomes the next argument. With prev=tag
+ # the next loop iteration processes NAME exactly like `--tag NAME'.
+ --tag=*)
+ set tag "$optarg" ${1+"$@"}
+ shift
+ prev=tag
+ ;;
+
+ -dlopen)
+ prevopt="-dlopen"
+ prev=execute_dlfiles
+ ;;
+
+ -*)
+ $echo "$modename: unrecognized option \`$arg'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ nonopt="$arg"
+ break
+ ;;
+ esac
+done
+
+if test -n "$prevopt"; then
+ $echo "$modename: option \`$prevopt' requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+fi
+
+# If this variable is set in any of the actions, the command in it
+# will be execed at the end. This prevents here-documents from being
+# left over by shells.
+exec_cmd=
+
+if test -z "$show_help"; then
+
+ # Infer the operation mode.
+ # Only done when no mode was selected on the command line; the guess is
+ # based on the first non-option argument, held in $nonopt.
+ if test -z "$mode"; then
+ case $nonopt in
+ # Compiler-like names: assume link mode, but switch to compile mode as
+ # soon as a -c is found among the remaining arguments.
+ *cc | *++ | gcc* | *-gcc*)
+ mode=link
+ for arg
+ do
+ case $arg in
+ -c)
+ mode=compile
+ break
+ ;;
+ esac
+ done
+ ;;
+ # Names ending in db, dbx, strace or truss select execute mode.
+ *db | *dbx | *strace | *truss)
+ mode=execute
+ ;;
+ # Anything containing "install", or exactly cp / mv, selects install mode.
+ *install*|cp|mv)
+ mode=install
+ ;;
+ # Names ending in rm select uninstall mode.
+ *rm)
+ mode=uninstall
+ ;;
+ *)
+ # If we have no mode, but dlfiles were specified, then do execute mode.
+ test -n "$execute_dlfiles" && mode=execute
+
+ # Just use the default operation mode.
+ # Nothing is assigned here: $mode stays empty and only a warning is
+ # printed to stderr.
+ if test -z "$mode"; then
+ if test -n "$nonopt"; then
+ $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
+ else
+ $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
+ fi
+ fi
+ ;;
+ esac
+ fi
+
+ # Only execute mode is allowed to have -dlopen flags.
+ if test -n "$execute_dlfiles" && test "$mode" != execute; then
+ $echo "$modename: unrecognized option \`-dlopen'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Change the help message to a mode-specific one.
+ generic_help="$help"
+ help="Try \`$modename --help --mode=$mode' for more information."
+
+ # These modes are in order of execution frequency so that they run quickly.
+ case $mode in
+ # libtool compile mode
+ compile)
+ modename="$modename: compile"
+ # Get the compilation command and the source file.
+ base_compile=
+ prev=
+ lastarg=
+ srcfile="$nonopt"
+ suppress_output=
+
+ user_target=no
+ for arg
+ do
+ # Handle the argument that follows a -Xcompiler flag: it is passed to the
+ # compiler verbatim, so it is quoted and appended to base_compile here and
+ # the rest of the loop body is skipped.
+ case $prev in
+ "") ;;
+ xcompiler)
+ # Aesthetically quote the previous argument.
+ prev=
+ lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ # The quotes have to be added to $lastarg, which is the value appended
+ # to base_compile below; quoting $arg would be lost, because $arg is
+ # discarded by the `continue' and the command is later run through eval.
+ case $lastarg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ lastarg="\"$lastarg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ # Accept any command-line options.
+ case $arg in
+ -o)
+ if test "$user_target" != "no"; then
+ $echo "$modename: you cannot specify \`-o' more than once" 1>&2
+ exit 1
+ fi
+ user_target=next
+ ;;
+
+ -static)
+ build_old_libs=yes
+ continue
+ ;;
+
+ -prefer-pic)
+ pic_mode=yes
+ continue
+ ;;
+
+ -prefer-non-pic)
+ pic_mode=no
+ continue
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"`
+ lastarg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for arg in $args; do
+ IFS="$save_ifs"
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ lastarg="$lastarg $arg"
+ done
+ IFS="$save_ifs"
+ lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"`
+
+ # Add the arguments to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ case $user_target in
+ next)
+ # The next one is the -o target name
+ user_target=yes
+ continue
+ ;;
+ yes)
+ # We got the output file
+ user_target=set
+ libobj="$arg"
+ continue
+ ;;
+ esac
+
+ # Accept the current argument as the source file.
+ lastarg="$srcfile"
+ srcfile="$arg"
+
+ # Aesthetically quote the previous argument.
+
+ # Backslashify any backslashes, double quotes, and dollar signs.
+ # These are the only characters that are still specially
+ # interpreted inside of double-quoted strings.
+ lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case $lastarg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ lastarg="\"$lastarg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ done
+
+ case $user_target in
+ set)
+ ;;
+ no)
+ # Get the name of the library object.
+ libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+ ;;
+ *)
+ $echo "$modename: you must specify a target with \`-o'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Recognize several different file suffixes.
+ # If the user specifies -o file.o, it is replaced with file.lo
+ xform='[cCFSfmso]'
+ case $libobj in
+ *.ada) xform=ada ;;
+ *.adb) xform=adb ;;
+ *.ads) xform=ads ;;
+ *.asm) xform=asm ;;
+ *.c++) xform=c++ ;;
+ *.cc) xform=cc ;;
+ *.class) xform=class ;;
+ *.cpp) xform=cpp ;;
+ *.cxx) xform=cxx ;;
+ *.f90) xform=f90 ;;
+ *.for) xform=for ;;
+ *.java) xform=java ;;
+ esac
+
+ libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
+
+ case $libobj in
+ *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;;
+ *)
+ $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Infer tagged configuration to use if any are available and
+ # if one wasn't chosen via the "--tag" command line option.
+ # Only attempt this if the compiler in the base compile
+ # command doesn't match the default compiler.
+ # Nothing to infer when no tags are available or one was already chosen.
+ if test -n "$available_tags" && test -z "$tagname"; then
+ case $base_compile in
+ # The command starts with the default compiler: no tag needed.
+ "$CC "*) ;;
+ # Blanks in the command may have been stripped by the calling shell,
+ # but not from the CC environment variable when ltconfig was run.
+ "`$echo $CC` "*) ;;
+ *)
+ # Try every available tag: load its configuration section from this
+ # script (which redefines CC) and see whether the command now starts
+ # with that compiler.
+ for z in $available_tags; do
+ if grep "^### BEGIN LIBTOOL TAG CONFIG: $z$" < "$0" > /dev/null; then
+ # Evaluate the configuration.
+ eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^### END LIBTOOL TAG CONFIG: '$z'$/p' < $0`"
+ case $base_compile in
+ "$CC "*)
+ # The compiler in the base compile command matches
+ # the one in the tagged configuration.
+ # Assume this is the tagged configuration we want.
+ tagname=$z
+ break
+ ;;
+ "`$echo $CC` "*)
+ tagname=$z
+ break
+ ;;
+ esac
+ fi
+ done
+ # If $tagname still isn't set, then no tagged configuration
+ # was found and let the user know that the "--tag" command
+ # line option must be used.
+ # Both lines of the diagnostic go to stderr so that they stay
+ # together and do not pollute captured standard output.
+ if test -z "$tagname"; then
+ echo "$modename: unable to infer tagged configuration" 1>&2
+ echo "$modename: specify a tag with \`--tag'" 1>&2
+ exit 1
+# else
+# echo "$modename: using $tagname tagged configuration"
+ fi
+ ;;
+ esac
+ fi
+
+ objname=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+ xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$obj"; then
+ xdir=
+ else
+ xdir=$xdir/
+ fi
+ lobj=${xdir}$objdir/$objname
+
+ if test -z "$base_compile"; then
+ $echo "$modename: you must specify a compilation command" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Delete any leftover library objects.
+ if test "$build_old_libs" = yes; then
+ removelist="$obj $lobj $libobj ${libobj}T"
+ else
+ removelist="$lobj $libobj ${libobj}T"
+ fi
+
+ $run $rm $removelist
+ trap "$run $rm $removelist; exit 1" 1 2 15
+
+ # On Cygwin there's no "real" PIC flag so we must build both object types
+ case $host_os in
+ cygwin* | mingw* | pw32* | os2*)
+ pic_mode=default
+ ;;
+ esac
+ # $pic_mode is quoted: when it is empty or unset, an unquoted operand
+ # would vanish and leave `test = no', which is a syntax error.
+ if test "$pic_mode" = no && test "$deplibs_check_method" != pass_all; then
+ # non-PIC code in shared libraries is not supported
+ pic_mode=default
+ fi
+
+ # A compiler that accepts -c together with -o needs no temporary object
+ # and therefore no locking. Otherwise the compiler output is expected
+ # under the source file's base name with the object extension; that file
+ # and its lock file are added to the list removed on failure or signal.
+ if test "$compiler_c_o" != no; then
+ output_obj=
+ need_locks=no
+ lockfile=
+ else
+ output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.${objext}
+ lockfile="$output_obj.lock"
+ removelist="$removelist $output_obj $lockfile"
+ trap "$run $rm $removelist; exit 1" 1 2 15
+ fi
+
+ # Lock this critical section if it is needed
+ # We use this script file to make the link, it avoids creating a new file
+ # need_locks=yes: keep trying to hard-link this script to $lockfile,
+ # sleeping two seconds between attempts, until the link succeeds.
+ if test "$need_locks" = yes; then
+ until $run ln "$0" "$lockfile" 2>/dev/null; do
+ $show "Waiting for $lockfile to be removed"
+ sleep 2
+ done
+ # need_locks=warn: do not wait. An already existing lock file is treated
+ # as a fatal conflict; otherwise the source file name is written into the
+ # lock file so that the checks after each compilation can compare it.
+ elif test "$need_locks" = warn; then
+ if test -f "$lockfile"; then
+ echo "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+ echo $srcfile > "$lockfile"
+ fi
+
+ if test -n "$fix_srcfile_path"; then
+ eval srcfile=\"$fix_srcfile_path\"
+ fi
+
+ $run $rm "$libobj" "${libobj}T"
+
+ # Create a libtool object file (analogous to a ".la" file),
+ # but don't create it if we're doing a dry run.
+ test -z "$run" && cat > ${libobj}T <<EOF
+# $libobj - a libtool object file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Name of the PIC object.
+EOF
+
+ # Only build a PIC object if we are building libtool libraries.
+ if test "$build_libtool_libs" = yes; then
+ # Without this assignment, base_compile gets emptied.
+ fbsd_hideous_sh_bug=$base_compile
+
+ if test "$pic_mode" != no; then
+ command="$base_compile $srcfile $pic_flag"
+ else
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ fi
+
+ if test ! -d ${xdir}$objdir; then
+ $show "$mkdir ${xdir}$objdir"
+ $run $mkdir ${xdir}$objdir
+ status=$?
+ if test $status -ne 0 && test ! -d ${xdir}$objdir; then
+ exit $status
+ fi
+ fi
+
+ if test -z "$output_obj"; then
+ # Place PIC objects in $objdir
+ command="$command -o $lobj"
+ fi
+
+ $run $rm "$lobj" "$output_obj"
+
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ test -n "$output_obj" && $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed, then go on to compile the next one
+ if test -n "$output_obj" && test "x$output_obj" != "x$lobj"; then
+ $show "$mv $output_obj $lobj"
+ if $run $mv $output_obj $lobj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Append the name of the PIC object to the libtool object file.
+ test -z "$run" && cat >> ${libobj}T <<EOF
+pic_object='$objdir/$objname'
+
+EOF
+
+ # Allow error messages only from the first compilation.
+ suppress_output=' >/dev/null 2>&1'
+ else
+ # No PIC object so indicate it doesn't exist in the libtool
+ # object file.
+ test -z "$run" && cat >> ${libobj}T <<EOF
+pic_object=none
+
+EOF
+ fi
+
+ # Only build a position-dependent object if we build old libraries.
+ if test "$build_old_libs" = yes; then
+ if test "$pic_mode" != yes; then
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ else
+ command="$base_compile $srcfile $pic_flag"
+ fi
+ if test "$compiler_c_o" = yes; then
+ command="$command -o $obj"
+ fi
+
+ # Suppress compiler output if we already did a PIC compilation.
+ command="$command$suppress_output"
+ $run $rm "$obj" "$output_obj"
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed
+ if test -n "$output_obj" && test "x$output_obj" != "x$obj"; then
+ $show "$mv $output_obj $obj"
+ if $run $mv $output_obj $obj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Append the name of the non-PIC object to the libtool object file.
+ # Only append if the libtool object file exists.
+ test -z "$run" && cat >> ${libobj}T <<EOF
+# Name of the non-PIC object.
+non_pic_object='$objname'
+
+EOF
+ else
+ # Append the name of the non-PIC object to the libtool object file.
+ # Only append if the libtool object file exists.
+ test -z "$run" && cat >> ${libobj}T <<EOF
+# Name of the non-PIC object.
+non_pic_object=none
+
+EOF
+ fi
+
+ $run $mv "${libobj}T" "${libobj}"
+
+ # Unlock the critical section if it was locked
+ if test "$need_locks" != no; then
+ $run $rm "$lockfile"
+ fi
+
+ exit 0
+ ;;
+
+ # libtool link mode
+ link | relink)
+ modename="$modename: link"
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+ # It is impossible to link a dll without this setting, and
+ # we shouldn't force the makefile maintainer to figure out
+ # which system we are compiling for in order to pass an extra
+ # flag for every libtool invocation.
+ # allow_undefined=no
+
+ # FIXME: Unfortunately, there are problems with the above when trying
+ # to make a dll which has undefined symbols, in which case not
+ # even a static library is built. For now, we need to specify
+ # -no-undefined on the libtool link line when we can be certain
+ # that all symbols are satisfied, otherwise we get a static library.
+ allow_undefined=yes
+ ;;
+ *)
+ allow_undefined=yes
+ ;;
+ esac
+ libtool_args="$nonopt"
+ base_compile="$nonopt"
+ compile_command="$nonopt"
+ finalize_command="$nonopt"
+
+ compile_rpath=
+ finalize_rpath=
+ compile_shlibpath=
+ finalize_shlibpath=
+ convenience=
+ old_convenience=
+ deplibs=
+ old_deplibs=
+ compiler_flags=
+ linker_flags=
+ dllsearchpath=
+ lib_search_path=`pwd`
+
+ avoid_version=no
+ dlfiles=
+ dlprefiles=
+ dlself=no
+ export_dynamic=no
+ export_symbols=
+ export_symbols_regex=
+ generated=
+ libobjs=
+ ltlibs=
+ module=no
+ no_install=no
+ objs=
+ non_pic_objects=
+ prefer_static_libs=no
+ preload=no
+ prev=
+ prevarg=
+ release=
+ rpath=
+ xrpath=
+ perm_rpath=
+ temp_rpath=
+ thread_safe=no
+ vinfo=
+
+ # We need to know -static, to get the right output filenames.
+ for arg
+ do
+ case $arg in
+ -all-static | -static)
+ if test "X$arg" = "X-all-static"; then
+ if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+ $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
+ fi
+ if test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ else
+ if test -z "$pic_flag" && test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ fi
+ build_libtool_libs=no
+ build_old_libs=yes
+ prefer_static_libs=yes
+ break
+ ;;
+ esac
+ done
+
+ # See if our shared archives depend on static archives.
+ test -n "$old_archive_from_new_cmds" && build_old_libs=yes
+
+ # Go through the arguments, transforming them on the way.
+ while test $# -gt 0; do
+ arg="$1"
+ base_compile="$base_compile $arg"
+ shift
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test
+ ;;
+ *) qarg=$arg ;;
+ esac
+ libtool_args="$libtool_args $qarg"
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case $prev in
+ output)
+ compile_command="$compile_command @OUTPUT@"
+ finalize_command="$finalize_command @OUTPUT@"
+ ;;
+ esac
+
+ case $prev in
+ dlfiles|dlprefiles)
+ if test "$preload" = no; then
+ # Add the symbol object into the linking commands.
+ compile_command="$compile_command @SYMFILE@"
+ finalize_command="$finalize_command @SYMFILE@"
+ preload=yes
+ fi
+ case $arg in
+ *.la | *.lo) ;; # We handle these cases below.
+ force)
+ if test "$dlself" = no; then
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ self)
+ if test "$prev" = dlprefiles; then
+ dlself=yes
+ elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+ dlself=yes
+ else
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ *)
+ if test "$prev" = dlfiles; then
+ dlfiles="$dlfiles $arg"
+ else
+ dlprefiles="$dlprefiles $arg"
+ fi
+ prev=
+ continue
+ ;;
+ esac
+ ;;
+ expsyms)
+ export_symbols="$arg"
+ # The file named after -export-symbols must already exist. Report the
+ # failure on stderr, like the other fatal diagnostics in link mode.
+ if test ! -f "$arg"; then
+ $echo "$modename: symbol file \`$arg' does not exist" 1>&2
+ exit 1
+ fi
+ prev=
+ continue
+ ;;
+ expsyms_regex)
+ export_symbols_regex="$arg"
+ prev=
+ continue
+ ;;
+ release)
+ release="-$arg"
+ prev=
+ continue
+ ;;
+ objectlist)
+ if test -f "$arg"; then
+ save_arg=$arg
+ moreargs=
+ for fil in `cat $save_arg`
+ do
+# moreargs="$moreargs $fil"
+ arg=$fil
+ # A libtool-controlled object.
+
+ # Check to see that this really is a libtool object.
+ if (sed -e '2q' $arg | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ pic_object=
+ non_pic_object=
+
+ # Read the .lo file
+ # If there is no directory component, then add one.
+ case $arg in
+ */* | *\\*) . $arg ;;
+ *) . ./$arg ;;
+ esac
+
+ # The sourced .lo file must define both pic_object and non_pic_object,
+ # and at least one of them must name a real object (not "none").
+ # In the shell, && and || have equal precedence and associate to the
+ # left, so the final pair is grouped explicitly; without the braces an
+ # empty name slipped through whenever non_pic_object was not "none".
+ if test -z "$pic_object" || \
+ test -z "$non_pic_object" ||
+ { test "$pic_object" = none && \
+ test "$non_pic_object" = none; }; then
+ $echo "$modename: cannot find name of object for \`$arg'" 1>&2
+ exit 1
+ fi
+
+ # Extract subdirectory from the argument.
+ xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$arg"; then
+ xdir=
+ else
+ xdir="$xdir/"
+ fi
+
+ if test "$pic_object" != none; then
+ # Prepend the subdirectory the object is found in.
+ pic_object="$xdir$pic_object"
+
+ if test "$prev" = dlfiles; then
+ if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+ dlfiles="$dlfiles $pic_object"
+ prev=
+ continue
+ else
+ # If libtool objects are unsupported, then we need to preload.
+ prev=dlprefiles
+ fi
+ fi
+
+ # CHECK ME: I think I busted this. -Ossama
+ if test "$prev" = dlprefiles; then
+ # Preload the old-style object.
+ dlprefiles="$dlprefiles $pic_object"
+ prev=
+ fi
+
+ # A PIC object.
+ libobjs="$libobjs $pic_object"
+ arg="$pic_object"
+ fi
+
+ # Non-PIC object.
+ if test "$non_pic_object" != none; then
+ # Prepend the subdirectory the object is found in.
+ non_pic_object="$xdir$non_pic_object"
+
+ # A standard non-PIC object
+ non_pic_objects="$non_pic_objects $non_pic_object"
+ if test -z "$pic_object" || test "$pic_object" = none ; then
+ arg="$non_pic_object"
+ fi
+ fi
+ else
+ # Only an error if not doing a dry-run.
+ if test -z "$run"; then
+ $echo "$modename: \`$arg' is not a valid libtool object" 1>&2
+ exit 1
+ else
+ # Dry-run case.
+
+ # Extract subdirectory from the argument.
+ xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$arg"; then
+ xdir=
+ else
+ xdir="$xdir/"
+ fi
+
+ pic_object=`$echo "X${xdir}${objdir}/${arg}" | $Xsed -e "$lo2o"`
+ non_pic_object=`$echo "X${xdir}${arg}" | $Xsed -e "$lo2o"`
+ libobjs="$libobjs $pic_object"
+ non_pic_objects="$non_pic_objects $non_pic_object"
+ fi
+ fi
+ done
+ else
+ # $save_arg is only assigned in the branch where the list file exists,
+ # so name the file via $arg here, and send the diagnostic to stderr.
+ $echo "$modename: link input file \`$arg' does not exist" 1>&2
+ exit 1
+ fi
+ arg=$save_arg
+ prev=
+ continue
+ ;;
+ rpath | xrpath)
+ # We need an absolute path.
+ case $arg in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ if test "$prev" = rpath; then
+ case "$rpath " in
+ *" $arg "*) ;;
+ *) rpath="$rpath $arg" ;;
+ esac
+ else
+ case "$xrpath " in
+ *" $arg "*) ;;
+ *) xrpath="$xrpath $arg" ;;
+ esac
+ fi
+ prev=
+ continue
+ ;;
+ xcompiler)
+ compiler_flags="$compiler_flags $qarg"
+ prev=
+ compile_command="$compile_command $qarg"
+ finalize_command="$finalize_command $qarg"
+ continue
+ ;;
+ xlinker)
+ linker_flags="$linker_flags $qarg"
+ compiler_flags="$compiler_flags $wl$qarg"
+ prev=
+ compile_command="$compile_command $wl$qarg"
+ finalize_command="$finalize_command $wl$qarg"
+ continue
+ ;;
+ *)
+ eval "$prev=\"\$arg\""
+ prev=
+ continue
+ ;;
+ esac
+ fi # test -n $prev
+
+ prevarg="$arg"
+
+ case $arg in
+ -all-static)
+ if test -n "$link_static_flag"; then
+ compile_command="$compile_command $link_static_flag"
+ finalize_command="$finalize_command $link_static_flag"
+ fi
+ continue
+ ;;
+
+ -allow-undefined)
+ # FIXME: remove this flag sometime in the future.
+ $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
+ continue
+ ;;
+
+ -avoid-version)
+ avoid_version=yes
+ continue
+ ;;
+
+ -dlopen)
+ prev=dlfiles
+ continue
+ ;;
+
+ -dlpreopen)
+ prev=dlprefiles
+ continue
+ ;;
+
+ -export-dynamic)
+ export_dynamic=yes
+ continue
+ ;;
+
+ -export-symbols | -export-symbols-regex)
+ # Only one of -export-symbols / -export-symbols-regex may be given, and
+ # only once. The diagnostic goes to stderr like other fatal errors.
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: more than one -exported-symbols argument is not allowed" 1>&2
+ exit 1
+ fi
+ if test "X$arg" = "X-export-symbols"; then
+ prev=expsyms
+ else
+ prev=expsyms_regex
+ fi
+ continue
+ ;;
+
+ # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:*
+ # so, if we see these flags be careful not to treat them like -L
+ -L[A-Z][A-Z]*:*)
+ case $with_gcc/$host in
+ no/*-*-irix*)
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ ;;
+ esac
+ continue
+ ;;
+
+ -L*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-L//'`
+ # We need an absolute path.
+ case $dir in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
+ exit 1
+ fi
+ dir="$absdir"
+ ;;
+ esac
+ case "$deplibs " in
+ *" -L$dir "*) ;;
+ *)
+ deplibs="$deplibs -L$dir"
+ lib_search_path="$lib_search_path $dir"
+ ;;
+ esac
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+ case :$dllsearchpath: in
+ *":$dir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$dir";;
+ esac
+ ;;
+ esac
+ continue
+ ;;
+
+ -l*)
+ if test "X$arg" = "X-lc" || test "X$arg" = "X-lm"; then
+ case $host in
+ *-*-cygwin* | *-*-pw32* | *-*-beos*)
+ # These systems don't actually have a C or math library (as such)
+ continue
+ ;;
+ *-*-mingw* | *-*-os2*)
+ # These systems don't actually have a C library (as such)
+ test "X$arg" = "X-lc" && continue
+ ;;
+ esac
+ fi
+ deplibs="$deplibs $arg"
+ continue
+ ;;
+
+ -module)
+ module=yes
+ continue
+ ;;
+
+ -no-fast-install)
+ fast_install=no
+ continue
+ ;;
+
+ -no-install)
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+ # The PATH hackery in wrapper scripts is required on Windows
+ # in order for the loader to find any dlls it needs.
+ $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2
+ $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2
+ fast_install=no
+ ;;
+ *) no_install=yes ;;
+ esac
+ continue
+ ;;
+
+ -no-undefined)
+ allow_undefined=no
+ continue
+ ;;
+
+ -objectlist)
+ prev=objectlist
+ continue
+ ;;
+
+ -o) prev=output ;;
+
+ -release)
+ prev=release
+ continue
+ ;;
+
+ -rpath)
+ prev=rpath
+ continue
+ ;;
+
+ -R)
+ prev=xrpath
+ continue
+ ;;
+
+ -R*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-R//'`
+ # We need an absolute path.
+ case $dir in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ continue
+ ;;
+
+ -static)
+ # The effects of -static are defined in a previous loop.
+ # We used to do the same as -all-static on platforms that
+ # didn't have a PIC flag, but the assumption that the effects
+ # would be equivalent was wrong. It would break on at least
+ # Digital Unix and AIX.
+ continue
+ ;;
+
+ -thread-safe)
+ thread_safe=yes
+ continue
+ ;;
+
+ -version-info)
+ prev=vinfo
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case $flag in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ # -Wc, flags are meant for the compiler driver itself, so they are
+ # appended without the $wl prefix that the -Wl, and -Xlinker arms use.
+ # This matches the -Xcompiler arm and compiler_flags just below.
+ arg="$arg $flag"
+ compiler_flags="$compiler_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Wl,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case $flag in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ arg="$arg $wl$flag"
+ compiler_flags="$compiler_flags $wl$flag"
+ linker_flags="$linker_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Xlinker)
+ prev=xlinker
+ continue
+ ;;
+
+ # Some other compiler flag.
+ -* | +*)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+
+ *.$objext)
+ # A standard object.
+ objs="$objs $arg"
+ ;;
+
+ *.lo)
+ # A libtool-controlled object.
+
+ # Check to see that this really is a libtool object.
+ if (sed -e '2q' $arg | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ pic_object=
+ non_pic_object=
+
+ # Read the .lo file
+ # If there is no directory component, then add one.
+ case $arg in
+ */* | *\\*) . $arg ;;
+ *) . ./$arg ;;
+ esac
+
+ # The sourced .lo file must define both pic_object and non_pic_object,
+ # and at least one of them must name a real object (not "none").
+ # In the shell, && and || have equal precedence and associate to the
+ # left, so the final pair is grouped explicitly; without the braces an
+ # empty name slipped through whenever non_pic_object was not "none".
+ if test -z "$pic_object" || \
+ test -z "$non_pic_object" ||
+ { test "$pic_object" = none && \
+ test "$non_pic_object" = none; }; then
+ $echo "$modename: cannot find name of object for \`$arg'" 1>&2
+ exit 1
+ fi
+
+ # Extract subdirectory from the argument.
+ xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$arg"; then
+ xdir=
+ else
+ xdir="$xdir/"
+ fi
+
+ if test "$pic_object" != none; then
+ # Prepend the subdirectory the object is found in.
+ pic_object="$xdir$pic_object"
+
+ if test "$prev" = dlfiles; then
+ if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+ dlfiles="$dlfiles $pic_object"
+ prev=
+ continue
+ else
+ # If libtool objects are unsupported, then we need to preload.
+ prev=dlprefiles
+ fi
+ fi
+
+ # CHECK ME: I think I busted this. -Ossama
+ if test "$prev" = dlprefiles; then
+ # Preload the old-style object.
+ dlprefiles="$dlprefiles $pic_object"
+ prev=
+ fi
+
+ # A PIC object.
+ libobjs="$libobjs $pic_object"
+ arg="$pic_object"
+ fi
+
+ # Non-PIC object.
+ if test "$non_pic_object" != none; then
+ # Prepend the subdirectory the object is found in.
+ non_pic_object="$xdir$non_pic_object"
+
+ # A standard non-PIC object
+ non_pic_objects="$non_pic_objects $non_pic_object"
+ if test -z "$pic_object" || test "$pic_object" = none ; then
+ arg="$non_pic_object"
+ fi
+ fi
+ else
+ # Only an error if not doing a dry-run.
+ if test -z "$run"; then
+ $echo "$modename: \`$arg' is not a valid libtool object" 1>&2
+ exit 1
+ else
+ # Dry-run case.
+
+ # Extract subdirectory from the argument.
+ xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$arg"; then
+ xdir=
+ else
+ xdir="$xdir/"
+ fi
+
+ pic_object=`$echo "X${xdir}${objdir}/${arg}" | $Xsed -e "$lo2o"`
+ non_pic_object=`$echo "X${xdir}${arg}" | $Xsed -e "$lo2o"`
+ libobjs="$libobjs $pic_object"
+ non_pic_objects="$non_pic_objects $non_pic_object"
+ fi
+ fi
+ ;;
+
+ *.$libext)
+ # An archive.
+ deplibs="$deplibs $arg"
+ old_deplibs="$old_deplibs $arg"
+ continue
+ ;;
+
+ *.la)
+ # A libtool-controlled library.
+
+ if test "$prev" = dlfiles; then
+ # This library was specified with -dlopen.
+ dlfiles="$dlfiles $arg"
+ prev=
+ elif test "$prev" = dlprefiles; then
+ # The library was specified with -dlpreopen.
+ dlprefiles="$dlprefiles $arg"
+ prev=
+ else
+ deplibs="$deplibs $arg"
+ fi
+ continue
+ ;;
+
+ # Some other compiler argument.
+ *)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+ esac # arg
+
+ # Now actually substitute the argument into the commands.
+ if test -n "$arg"; then
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+ done # argument parsing loop
+
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Infer tagged configuration to use if any are available and
+ # if one wasn't chosen via the "--tag" command line option.
+ # Only attempt this if the compiler in the base link
+ # command doesn't match the default compiler.
+ if test -n "$available_tags" && test -z "$tagname"; then
+ case $base_compile in
+ "$CC "*) ;;
+ # Blanks in the command may have been stripped by the calling shell,
+ # but not from the CC environment variable when ltconfig was run.
+ "`$echo $CC` "*) ;;
+ *)
+ for z in $available_tags; do
+ if grep "^### BEGIN LIBTOOL TAG CONFIG: $z$" < "$0" > /dev/null; then
+ # Evaluate the configuration.
+ eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^### END LIBTOOL TAG CONFIG: '$z'$/p' < $0`"
+ case $base_compile in
+ "$CC "*)
+ # The compiler in $compile_command matches
+ # the one in the tagged configuration.
+ # Assume this is the tagged configuration we want.
+ tagname=$z
+ break
+ ;;
+ "`$echo $CC` "*)
+ tagname=$z
+ break
+ ;;
+ esac
+ fi
+ done
+ # If $tagname still isn't set, then no tagged configuration
+ # was found and let the user know that the "--tag" command
+ # line option must be used.
+ # No tagged configuration matched the compiler in $base_compile.
+ # Both lines of the diagnostic go to stderr so they stay together.
+ if test -z "$tagname"; then
+ echo "$modename: unable to infer tagged configuration" 1>&2
+ echo "$modename: specify a tag with \`--tag'" 1>&2
+ exit 1
+# else
+# echo "$modename: using $tagname tagged configuration"
+ fi
+ ;;
+ esac
+ fi
+
+ if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+ eval arg=\"$export_dynamic_flag_spec\"
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+
+ # calculate the name of the file, without its directory
+ outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'`
+ libobjs_save="$libobjs"
+
+ if test -n "$shlibpath_var"; then
+ # get the directories listed in $shlibpath_var
+ eval shlib_search_path=\`\$echo \"X\${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\`
+ else
+ shlib_search_path=
+ fi
+ eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+ eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+
+ output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$output_objdir" = "X$output"; then
+ output_objdir="$objdir"
+ else
+ output_objdir="$output_objdir/$objdir"
+ fi
+ # Create the object directory.
+ if test ! -d $output_objdir; then
+ $show "$mkdir $output_objdir"
+ $run $mkdir $output_objdir
+ status=$?
+ if test $status -ne 0 && test ! -d $output_objdir; then
+ exit $status
+ fi
+ fi
+
+ # Determine the type of output
+ case $output in
+ "")
+ $echo "$modename: you must specify an output file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ *.$libext) linkmode=oldlib ;;
+ *.lo | *.$objext) linkmode=obj ;;
+ *.la) linkmode=lib ;;
+ *) linkmode=prog ;; # Anything else should be a program.
+ esac
+
+ specialdeplibs=
+ libs=
+ # Find all interdependent deplibs by searching for libraries
+ # that are linked more than once (e.g. -la -lb -la)
+ for deplib in $deplibs; do
+ case "$libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ libs="$libs $deplib"
+ done
+
+ if test $linkmode = lib; then
+ libs="$predeps $libs $compiler_lib_search_path $postdeps"
+
+ # Compute libraries that are listed more than once in $predeps
+ # $postdeps and mark them as special (i.e., whose duplicates are
+ # not to be eliminated).
+ pre_post_deps=
+ for pre_post_dep in $predeps $postdeps; do
+ case "$pre_post_deps " in
+ *" $pre_post_dep "*) specialdeplibs="$specialdeplibs $pre_post_deps" ;;
+ esac
+ pre_post_deps="$pre_post_deps $pre_post_dep"
+ done
+ pre_post_deps=
+ fi
+
+ deplibs=
+ newdependency_libs=
+ newlib_search_path=
+ need_relink=no # whether we're linking any uninstalled libtool libraries
+ notinst_deplibs= # not-installed libtool libraries
+ notinst_path= # paths that contain not-installed libtool libraries
+ case $linkmode in
+ lib)
+ passes="conv link"
+ for file in $dlfiles $dlprefiles; do
+ case $file in
+ *.la) ;;
+ *)
+ $echo "$modename: libraries can \`-dlopen' only libtool libraries: $file" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ prog)
+ compile_deplibs=
+ finalize_deplibs=
+ alldeplibs=no
+ newdlfiles=
+ newdlprefiles=
+ passes="conv scan dlopen dlpreopen link"
+ ;;
+ *) passes="conv"
+ ;;
+ esac
+ for pass in $passes; do
+ if test $linkmode = prog; then
+ # Determine which files to process
+ case $pass in
+ dlopen)
+ libs="$dlfiles"
+ save_deplibs="$deplibs" # Collect dlpreopened libraries
+ deplibs=
+ ;;
+ dlpreopen) libs="$dlprefiles" ;;
+ link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+ esac
+ fi
+ for deplib in $libs; do
+ lib=
+ found=no
+ case $deplib in
+ -l*)
+ # -l has no meaning when the output is an archive or an object.
+ # $linkmode holds a single value, so the two tests are joined with ||;
+ # joined with && the condition could never be true.
+ if test $linkmode = oldlib || test $linkmode = obj; then
+ $echo "$modename: warning: \`-l' is ignored for archives/objects: $deplib" 1>&2
+ continue
+ fi
+ if test $pass = conv; then
+ deplibs="$deplib $deplibs"
+ continue
+ fi
+ name=`$echo "X$deplib" | $Xsed -e 's/^-l//'`
+ for searchdir in $newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path; do
+ # Search the libtool library
+ lib="$searchdir/lib${name}.la"
+ if test -f "$lib"; then
+ found=yes
+ break
+ fi
+ done
+ if test "$found" != yes; then
+ # deplib doesn't seem to be a libtool library
+ if test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ deplibs="$deplib $deplibs"
+ test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs"
+ fi
+ continue
+ fi
+ ;; # -l
+ -L*)
+ case $linkmode in
+ lib)
+ deplibs="$deplib $deplibs"
+ test $pass = conv && continue
+ newdependency_libs="$deplib $newdependency_libs"
+ newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ ;;
+ prog)
+ if test $pass = conv; then
+ deplibs="$deplib $deplibs"
+ continue
+ fi
+ if test $pass = scan; then
+ deplibs="$deplib $deplibs"
+ newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ ;;
+ *)
+ $echo "$modename: warning: \`-L' is ignored for archives/objects: $deplib" 1>&2
+ ;;
+ esac # linkmode
+ continue
+ ;; # -L
+ -R*)
+ if test $pass = link; then
+ dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'`
+ # Make sure the xrpath contains only unique directories.
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ fi
+ deplibs="$deplib $deplibs"
+ continue
+ ;;
+ *.la) lib="$deplib" ;;
+ *.$libext)
+ if test $pass = conv; then
+ deplibs="$deplib $deplibs"
+ continue
+ fi
+ case $linkmode in
+ lib)
+ if test "$deplibs_check_method" != pass_all; then
+ echo
+ echo "*** Warning: This library needs some functionality provided by $deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the"
+ echo "*** static library $deplib is not portable!"
+ deplibs="$deplib $deplibs"
+ fi
+ continue
+ ;;
+ prog)
+ if test $pass != link; then
+ deplibs="$deplib $deplibs"
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ continue
+ ;;
+ esac # linkmode
+ ;; # *.$libext
+ *.lo | *.$objext)
+ if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlopen support or we're linking statically,
+ # we need to preload.
+ newdlprefiles="$newdlprefiles $deplib"
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ newdlfiles="$newdlfiles $deplib"
+ fi
+ continue
+ ;;
+ %DEPLIBS%)
+ alldeplibs=yes
+ continue
+ ;;
+ esac # case $deplib
+ if test $found = yes || test -f "$lib"; then :
+ else
+ $echo "$modename: cannot find the library \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+
+ ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$ladir" = "X$lib" && ladir="."
+
+ dlname=
+ dlopen=
+ dlpreopen=
+ libdir=
+ library_names=
+ old_library=
+ # If the library was installed with an old release of libtool,
+ # it will not redefine variable installed.
+ installed=yes
+
+ # Read the .la file
+ case $lib in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+
+ # Pick up the dlopen/dlpreopen lists recorded in the .la file that was
+ # just sourced. This is done once per library: in the link pass for
+ # libraries, in the scan pass for programs, and whenever the output is
+ # an archive or an object. The last term uses || because $linkmode
+ # cannot equal both values at once.
+ if test "$linkmode,$pass" = "lib,link" ||
+ test "$linkmode,$pass" = "prog,scan" ||
+ { test $linkmode = oldlib || test $linkmode = obj; }; then
+ # Add dl[pre]opened files of deplib
+ test -n "$dlopen" && dlfiles="$dlfiles $dlopen"
+ test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen"
+ fi
+
+ if test $pass = conv; then
+ # Only check for convenience libraries
+ deplibs="$lib $deplibs"
+ if test -z "$libdir"; then
+ if test -z "$old_library"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+ # It is a libtool convenience library, so add in its objects.
+ convenience="$convenience $ladir/$objdir/$old_library"
+ old_convenience="$old_convenience $ladir/$objdir/$old_library"
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ deplibs="$deplib $deplibs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+ elif test $linkmode != prog && test $linkmode != lib; then
+ $echo "$modename: \`$lib' is not a convenience library" 1>&2
+ exit 1
+ fi
+ continue
+ fi # $pass = conv
+
+ # Get the name of the library we link against.
+ linklib=
+ for l in $old_library $library_names; do
+ linklib="$l"
+ done
+ if test -z "$linklib"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # This library was specified with -dlopen.
+ if test $pass = dlopen; then
+ if test -z "$libdir"; then
+ $echo "$modename: cannot -dlopen a convenience library: \`$lib'" 1>&2
+ exit 1
+ fi
+ if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlname, no dlopen support or we're linking
+ # statically, we need to preload.
+ dlprefiles="$dlprefiles $lib"
+ else
+ newdlfiles="$newdlfiles $lib"
+ fi
+ continue
+ fi # $pass = dlopen
+
+ # We need an absolute path.
+ case $ladir in
+ [\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+ *)
+ abs_ladir=`cd "$ladir" && pwd`
+ if test -z "$abs_ladir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2
+ $echo "$modename: passing it literally to the linker, although it might fail" 1>&2
+ abs_ladir="$ladir"
+ fi
+ ;;
+ esac
+ laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+
+ # Find the relevant object directory and library name.
+ if test "X$installed" = Xyes; then
+ if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+ $echo "$modename: warning: library \`$lib' was moved." 1>&2
+ dir="$ladir"
+ absdir="$abs_ladir"
+ libdir="$abs_ladir"
+ else
+ dir="$libdir"
+ absdir="$libdir"
+ fi
+ else
+ dir="$ladir/$objdir"
+ absdir="$abs_ladir/$objdir"
+ # Remove this search path later
+ notinst_path="$notinst_path $abs_ladir"
+ fi # $installed = yes
+ name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+
+ # This library was specified with -dlpreopen.
+ if test $pass = dlpreopen; then
+ if test -z "$libdir"; then
+ $echo "$modename: cannot -dlpreopen a convenience library: \`$lib'" 1>&2
+ exit 1
+ fi
+ # Prefer using a static library (so that no silly _DYNAMIC symbols
+ # are required to link).
+ if test -n "$old_library"; then
+ newdlprefiles="$newdlprefiles $dir/$old_library"
+ # Otherwise, use the dlname, so that lt_dlopen finds it.
+ elif test -n "$dlname"; then
+ newdlprefiles="$newdlprefiles $dir/$dlname"
+ else
+ newdlprefiles="$newdlprefiles $dir/$linklib"
+ fi
+ fi # $pass = dlpreopen
+
+ if test -z "$libdir"; then
+ # Link the convenience library
+ if test $linkmode = lib; then
+ deplibs="$dir/$old_library $deplibs"
+ elif test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$dir/$old_library $compile_deplibs"
+ finalize_deplibs="$dir/$old_library $finalize_deplibs"
+ else
+ deplibs="$lib $deplibs"
+ fi
+ continue
+ fi
+
+ if test $linkmode = prog && test $pass != link; then
+ newlib_search_path="$newlib_search_path $ladir"
+ deplibs="$lib $deplibs"
+
+ linkalldeplibs=no
+ if test "$link_all_deplibs" != no || test -z "$library_names" ||
+ test "$build_libtool_libs" = no; then
+ linkalldeplibs=yes
+ fi
+
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ case $deplib in
+ -L*) newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test
+ esac
+ # Need to link against all dependency_libs?
+ if test $linkalldeplibs = yes; then
+ deplibs="$deplib $deplibs"
+ else
+ # Need to hardcode shared library paths
+ # or/and link against static libraries
+ newdependency_libs="$deplib $newdependency_libs"
+ fi
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done # for deplib
+ continue
+ fi # $linkmode = prog...
+
+ link_static=no # Whether the deplib will be linked statically
+ if test -n "$library_names" &&
+ { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+ # Link against this shared library
+
+ if test "$linkmode,$pass" = "prog,link" ||
+ { test $linkmode = lib && test $hardcode_into_libs = yes; }; then
+ # Hardcode the library path.
+ # Skip directories that are in the system default run-time
+ # search path.
+ case " $sys_lib_dlsearch_path " in
+ *" $absdir "*) ;;
+ *)
+ case "$compile_rpath " in
+ *" $absdir "*) ;;
+ *) compile_rpath="$compile_rpath $absdir"
+ esac
+ ;;
+ esac
+ case " $sys_lib_dlsearch_path " in
+ *" $libdir "*) ;;
+ *)
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir"
+ esac
+ ;;
+ esac
+ if test $linkmode = prog; then
+ # We need to hardcode the library path
+ if test -n "$shlibpath_var"; then
+ # Make sure the rpath contains only unique directories.
+ case "$temp_rpath " in
+ *" $dir "*) ;;
+ *" $absdir "*) ;;
+ *) temp_rpath="$temp_rpath $dir" ;;
+ esac
+ fi
+ fi
+ fi # $linkmode,$pass = prog,link...
+
+ if test "$alldeplibs" = yes &&
+ { test "$deplibs_check_method" = pass_all ||
+ { test "$build_libtool_libs" = yes &&
+ test -n "$library_names"; }; }; then
+ # We only need to search for static libraries
+ continue
+ fi
+
+ if test "$installed" = no; then
+ notinst_deplibs="$notinst_deplibs $lib"
+ need_relink=yes
+ fi
+
+ if test -n "$old_archive_from_expsyms_cmds"; then
+ # figure out the soname
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+ libname=`eval \\$echo \"$libname_spec\"`
+ # use dlname if we got it. it's perfectly good, no?
+ if test -n "$dlname"; then
+ soname="$dlname"
+ elif test -n "$soname_spec"; then
+ # bleh windows
+ case $host in
+ *cygwin*)
+ major=`expr $current - $age`
+ versuffix="-$major"
+ ;;
+ esac
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+
+ # Make a new name for the extract_expsyms_cmds to use
+ soroot="$soname"
+ soname=`echo $soroot | sed -e 's/^.*\///'`
+ newlib="libimp-`echo $soname | sed 's/^lib//;s/\.dll$//'`.a"
+
+ # If the library has no export list, then create one now
+ if test -f "$output_objdir/$soname-def"; then :
+ else
+ $show "extracting exported symbol list from \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$extract_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Create $newlib
+ if test -f "$output_objdir/$newlib"; then :; else
+ $show "generating import library for \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$old_archive_from_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+ # make sure the library variables are pointing to the new library
+ dir=$output_objdir
+ linklib=$newlib
+ fi # test -n $old_archive_from_expsyms_cmds
+
+ if test $linkmode = prog || test "$mode" != relink; then
+ add_shlibpath=
+ add_dir=
+ add=
+ lib_linked=yes
+ case $hardcode_action in
+ immediate | unsupported)
+ if test "$hardcode_direct" = no; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = no; then
+ case $host in
+ *-*-sunos*) add_shlibpath="$dir" ;;
+ esac
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = no; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ relink)
+ if test "$hardcode_direct" = yes; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = yes; then
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = yes; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ *) lib_linked=no ;;
+ esac
+
+ # None of the hardcode_* settings examined above produced a way to
+ # link this library. This is fatal, so report it on stderr like the
+ # other fatal diagnostics in this script instead of on stdout.
+ if test "$lib_linked" != yes; then
+ $echo "$modename: configuration error: unsupported hardcode properties" 1>&2
+ exit 1
+ fi
+
+ if test -n "$add_shlibpath"; then
+ case :$compile_shlibpath: in
+ *":$add_shlibpath:"*) ;;
+ *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;;
+ esac
+ fi
+ if test $linkmode = prog; then
+ test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+ test -n "$add" && compile_deplibs="$add $compile_deplibs"
+ else
+ test -n "$add_dir" && deplibs="$add_dir $deplibs"
+ test -n "$add" && deplibs="$add $deplibs"
+ if test "$hardcode_direct" != yes && \
+ test "$hardcode_minus_L" != yes && \
+ test "$hardcode_shlibpath_var" = yes; then
+ case :$finalize_shlibpath: in
+ *":$libdir:"*) ;;
+ *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+ esac
+ fi
+ fi
+ fi
+
+ if test $linkmode = prog || test "$mode" = relink; then
+ add_shlibpath=
+ add_dir=
+ add=
+ # Finalize command for both is simple: just hardcode it.
+ if test "$hardcode_direct" = yes; then
+ add="$libdir/$linklib"
+ elif test "$hardcode_minus_L" = yes; then
+ add_dir="-L$libdir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = yes; then
+ case :$finalize_shlibpath: in
+ *":$libdir:"*) ;;
+ *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+ esac
+ add="-l$name"
+ else
+ # We cannot seem to hardcode it, guess we'll fake it.
+ add_dir="-L$libdir"
+ add="-l$name"
+ fi
+
+ if test $linkmode = prog; then
+ test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+ test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+ else
+ test -n "$add_dir" && deplibs="$add_dir $deplibs"
+ test -n "$add" && deplibs="$add $deplibs"
+ fi
+ fi
+ elif test $linkmode = prog; then
+ if test "$alldeplibs" = yes &&
+ { test "$deplibs_check_method" = pass_all ||
+ { test "$build_libtool_libs" = yes &&
+ test -n "$library_names"; }; }; then
+ # We only need to search for static libraries
+ continue
+ fi
+
+ # Try to link the static library
+ # Here we assume that one of hardcode_direct or hardcode_minus_L
+ # is not unsupported. This is valid on all known static and
+ # shared platforms.
+ if test "$hardcode_direct" != unsupported; then
+ test -n "$old_library" && linklib="$old_library"
+ compile_deplibs="$dir/$linklib $compile_deplibs"
+ finalize_deplibs="$dir/$linklib $finalize_deplibs"
+ else
+ compile_deplibs="-l$name -L$dir $compile_deplibs"
+ finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+ fi
+ elif test "$build_libtool_libs" = yes; then
+ # Not a shared library
+ if test "$deplibs_check_method" != pass_all; then
+ # We're trying to link a shared library against a static one
+ # but the system doesn't support it.
+
+ # Just print a warning and add the library to dependency_libs so
+ # that the program can be linked against the static library.
+ echo
+ echo "*** Warning: This library needs some functionality provided by $lib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ if test "$module" = yes; then
+ echo "*** Therefore, libtool will create a static module, that should work "
+ echo "*** as long as the dlopening application is linked with the -dlopen flag."
+ if test -z "$global_symbol_pipe"; then
+ echo
+ echo "*** However, this would only work if libtool was able to extract symbol"
+ echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+ echo "*** not find such a program. So, this module is probably useless."
+ echo "*** \`nm' from GNU binutils and a full rebuild may help."
+ fi
+ if test "$build_old_libs" = no; then
+ build_libtool_libs=module
+ build_old_libs=yes
+ else
+ build_libtool_libs=no
+ fi
+ fi
+ else
+ convenience="$convenience $dir/$old_library"
+ old_convenience="$old_convenience $dir/$old_library"
+ deplibs="$dir/$old_library $deplibs"
+ link_static=yes
+ fi
+ fi # link shared/static library?
+
+ if test $linkmode = lib; then
+ if test -n "$dependency_libs" &&
+ { test $hardcode_into_libs != yes || test $build_old_libs = yes ||
+ test $link_static = yes; }; then
+ # Extract -R from dependency_libs
+ temp_deplibs=
+ for libdir in $dependency_libs; do
+ case $libdir in
+ -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'`
+ case " $xrpath " in
+ *" $temp_xrpath "*) ;;
+ *) xrpath="$xrpath $temp_xrpath";;
+ esac;;
+ *) temp_deplibs="$temp_deplibs $libdir";;
+ esac
+ done
+ dependency_libs="$temp_deplibs"
+ fi
+
+ newlib_search_path="$newlib_search_path $absdir"
+ # Link against this library
+ test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+ # ... and its dependency_libs
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ newdependency_libs="$deplib $newdependency_libs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+
+ if test $link_all_deplibs != no; then
+ # Add the search paths of all dependency libraries
+ for deplib in $dependency_libs; do
+ case $deplib in
+ -L*) path="$deplib" ;;
+ *.la)
+ dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$deplib" && dir="."
+ # We need an absolute path.
+ case $dir in
+ [\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2
+ absdir="$dir"
+ fi
+ ;;
+ esac
+ if grep "^installed=no" $deplib > /dev/null; then
+ path="-L$absdir/$objdir"
+ else
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ if test "$absdir" != "$libdir"; then
+ $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2
+ fi
+ path="-L$absdir"
+ fi
+ ;;
+ *) continue ;;
+ esac
+ case " $deplibs " in
+ *" $path "*) ;;
+ *) deplibs="$path $deplibs" ;;
+ esac
+ done
+ fi # link_all_deplibs != no
+ fi # linkmode = lib
+ done # for deplib in $libs
+ if test $pass = dlpreopen; then
+ # Link the dlpreopened libraries before other libraries
+ for deplib in $save_deplibs; do
+ deplibs="$deplib $deplibs"
+ done
+ fi
+ if test $pass != dlopen; then
+ test $pass != scan && dependency_libs="$newdependency_libs"
+ if test $pass != conv; then
+ # Make sure lib_search_path contains only unique directories.
+ lib_search_path=
+ for dir in $newlib_search_path; do
+ case "$lib_search_path " in
+ *" $dir "*) ;;
+ *) lib_search_path="$lib_search_path $dir" ;;
+ esac
+ done
+ newlib_search_path=
+ fi
+
+ if test "$linkmode,$pass" != "prog,link"; then
+ vars="deplibs"
+ else
+ vars="compile_deplibs finalize_deplibs"
+ fi
+ # Rewrite each variable named in $vars, plus dependency_libs, with its
+ # entries in reverse order and redundant entries pruned.
+ for var in $vars dependency_libs; do
+ # Add libraries to $var in reverse order
+ eval tmp_libs=\"\$$var\"
+ new_libs=
+ # Pass 1: prepend every entry to new_libs, which reverses the list.
+ # -L flags and entries listed in $specialdeplibs are always kept;
+ # any other entry is dropped when it is already present in new_libs.
+ for deplib in $tmp_libs; do
+ case $deplib in
+ -L*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $specialdeplibs " in
+ *" $deplib "*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $new_libs " in
+ *" $deplib "*) ;;
+ *) new_libs="$deplib $new_libs" ;;
+ esac
+ ;;
+ esac
+ ;;
+ esac
+ done
+ tmp_libs=
+ # Pass 2: copy new_libs to tmp_libs in order, skipping any -L flag
+ # that has already been copied. Non -L entries are copied as-is.
+ for deplib in $new_libs; do
+ case $deplib in
+ -L*)
+ case " $tmp_libs " in
+ *" $deplib "*) ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ done
+ # Store the processed list back into the variable named by $var.
+ eval $var=\"$tmp_libs\"
+ done # for var
+ fi
+ if test "$pass" = "conv" &&
+ { test "$linkmode" = "lib" || test "$linkmode" = "prog"; }; then
+ libs="$deplibs" # reset libs
+ deplibs=
+ fi
+ done # for pass
+ if test $linkmode = prog; then
+ dlfiles="$newdlfiles"
+ dlprefiles="$newdlprefiles"
+ fi
+
+ case $linkmode in
+ oldlib)
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: warning: \`-export-symbols' is ignored for archives" 1>&2
+ fi
+
+ # Now set the variables for building old libraries.
+ build_libtool_libs=no
+ oldlibs="$output"
+ objs="$objs$old_deplibs"
+ ;;
+
+ lib)
+ # Make sure we only generate libraries of the form `libNAME.la'.
+ case $outputname in
+ lib*)
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+ eval libname=\"$libname_spec\"
+ ;;
+ *)
+ if test "$module" = no; then
+ $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ if test "$need_lib_prefix" != no; then
+ # Add the "lib" prefix for modules if required
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ eval libname=\"$libname_spec\"
+ else
+ libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ fi
+ ;;
+ esac
+
+ # $objs is non-empty: per the messages below these are non-libtool
+ # objects. Unless the deplibs check method is pass_all this is fatal;
+ # otherwise warn and add them to $libobjs.
+ if test -n "$objs"; then
+ if test "$deplibs_check_method" != pass_all; then
+ # Fatal diagnostic: it must go to stderr. The former `2>&1' only
+ # redirected stderr to stdout, leaving the message on stdout.
+ $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 1>&2
+ exit 1
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the non-libtool"
+ echo "*** objects $objs is not portable!"
+ libobjs="$libobjs $objs"
+ fi
+ fi
+
+ if test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2
+ fi
+
+ set dummy $rpath
+ if test $# -gt 2; then
+ $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
+ fi
+ install_libdir="$2"
+
+ oldlibs=
+ if test -z "$rpath"; then
+ if test "$build_libtool_libs" = yes; then
+ # Building a libtool convenience library.
+ # Some compilers have problems with a `.al' extension so
+ # convenience libraries should have the same extension an
+ # archive normally would.
+ oldlibs="$output_objdir/$libname.$libext $oldlibs"
+ build_libtool_libs=convenience
+ build_old_libs=yes
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2
+ fi
+ else
+
+ # Parse the version information argument.
+ # $vinfo is split on `:' into the positional parameters. The three
+ # trailing zeros act as defaults for any field that was left out.
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=':'
+ set dummy $vinfo 0 0 0
+ IFS="$save_ifs"
+
+ # $1 is `dummy'. With at most three fields in $vinfo, the fields plus
+ # the padding reach no further than $7, so a non-empty $8 means more
+ # than three fields were supplied.
+ if test -n "$8"; then
+ $echo "$modename: too many parameters to \`-version-info'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # The first three fields, in order.
+ current="$2"
+ revision="$3"
+ age="$4"
+
+ # Check that CURRENT, REVISION and AGE are each valid numbers.
+ case $current in
+ 0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+ *)
+ $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case $revision in
+ 0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+ *)
+ $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case $age in
+ 0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+ *)
+ $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ if test $age -gt $current; then
+ $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ fi
+
+ # Calculate the version variables.
+ major=
+ versuffix=
+ verstring=
+ case $version_type in
+ none) ;;
+
+ darwin)
+ # Like Linux, but with the current version available in
+ # verstring for coding it into the library header
+ major=.`expr $current - $age`
+ versuffix="$major.$age.$revision"
+ # Darwin ld doesn't like 0 for these options...
+ minor_current=`expr $current + 1`
+ verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+ ;;
+
+ freebsd-aout)
+ major=".$current"
+ versuffix=".$current.$revision";
+ ;;
+
+ freebsd-elf)
+ major=".$current"
+ versuffix=".$current";
+ ;;
+
+ irix)
+ major=`expr $current - $age + 1`
+ verstring="sgi$major.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$revision
+ while test $loop != 0; do
+ iface=`expr $revision - $loop`
+ loop=`expr $loop - 1`
+ verstring="sgi$major.$iface:$verstring"
+ done
+
+ # Before this point, $major must not contain `.'.
+ major=.$major
+ versuffix="$major.$revision"
+ ;;
+
+ linux)
+ major=.`expr $current - $age`
+ versuffix="$major.$age.$revision"
+ ;;
+
+ osf)
+ major=`expr $current - $age`
+ versuffix=".$current.$age.$revision"
+ verstring="$current.$age.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$age
+ while test $loop != 0; do
+ iface=`expr $current - $loop`
+ loop=`expr $loop - 1`
+ verstring="$verstring:${iface}.0"
+ done
+
+ # Make executables depend on our current version.
+ verstring="$verstring:${current}.0"
+ ;;
+
+ sunos)
+ major=".$current"
+ versuffix=".$current.$revision"
+ ;;
+
+ windows)
+ # Use '-' rather than '.', since we only want one
+ # extension on DOS 8.3 filesystems.
+ major=`expr $current - $age`
+ versuffix="-$major"
+ ;;
+
+ *)
+ $echo "$modename: unknown library version type \`$version_type'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Clear the version info if we defaulted, and they specified a release.
+ if test -z "$vinfo" && test -n "$release"; then
+ major=
+ verstring="0.0"
+ if test "$need_version" = no; then
+ versuffix=
+ else
+ versuffix=".0.0"
+ fi
+ fi
+
+ # Remove version info from name if versioning should be avoided
+ if test "$avoid_version" = yes && test "$need_version" = no; then
+ major=
+ versuffix=
+ verstring=""
+ fi
+
+ # Check to see if the archive will have undefined symbols.
+ if test "$allow_undefined" = yes; then
+ if test "$allow_undefined_flag" = unsupported; then
+ $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
+ build_libtool_libs=no
+ build_old_libs=yes
+ fi
+ else
+ # Don't allow undefined symbols.
+ allow_undefined_flag="$no_undefined_flag"
+ fi
+ fi
+
+ if test "$mode" != relink; then
+ # Remove our outputs, but don't remove object files since they
+ # may have been created when compiling PIC objects.
+ removelist=
+ tempremovelist=`echo "$output_objdir/*"`
+ for p in $tempremovelist; do
+ case $p in
+ *.$objext)
+ ;;
+ $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/${libname}${release}.*)
+ removelist="$removelist $p"
+ ;;
+ *) ;;
+ esac
+ done
+ if test -n "$removelist"; then
+ $show "${rm}r $removelist"
+ $run ${rm}r $removelist
+ fi
+ fi
+
+ # Now set the variables for building old libraries.
+ if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+ oldlibs="$oldlibs $output_objdir/$libname.$libext"
+
+ # Transform .lo files to .o files.
+ oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP`
+ fi
+
+ # Eliminate all temporary directories.
+ for path in $notinst_path; do
+ lib_search_path=`echo "$lib_search_path " | sed -e 's% $path % %g'`
+ deplibs=`echo "$deplibs " | sed -e 's% -L$path % %g'`
+ dependency_libs=`echo "$dependency_libs " | sed -e 's% -L$path % %g'`
+ done
+
+ if test -n "$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ temp_xrpath=
+ for libdir in $xrpath; do
+ temp_xrpath="$temp_xrpath -R$libdir"
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ if test $hardcode_into_libs != yes || test $build_old_libs = yes; then
+ dependency_libs="$temp_xrpath $dependency_libs"
+ fi
+ fi
+
+ # Make sure dlfiles contains only unique files that won't be dlpreopened
+ old_dlfiles="$dlfiles"
+ dlfiles=
+ for lib in $old_dlfiles; do
+ case " $dlprefiles $dlfiles " in
+ *" $lib "*) ;;
+ *) dlfiles="$dlfiles $lib" ;;
+ esac
+ done
+
+ # Make sure dlprefiles contains only unique files
+ old_dlprefiles="$dlprefiles"
+ dlprefiles=
+ for lib in $old_dlprefiles; do
+ case "$dlprefiles " in
+ *" $lib "*) ;;
+ *) dlprefiles="$dlprefiles $lib" ;;
+ esac
+ done
+
+ if test "$build_libtool_libs" = yes; then
+ if test -n "$rpath"; then
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos*)
+ # these systems don't actually have a c library (as such)!
+ ;;
+ *-*-rhapsody* | *-*-darwin1.[012])
+ # Rhapsody C library is in the System framework
+ deplibs="$deplibs -framework System"
+ ;;
+ *-*-netbsd*)
+ # Don't link with libc until the a.out ld.so is fixed.
+ ;;
+ *)
+ # Add libc to deplibs on all other systems if necessary.
+ if test $build_libtool_need_lc = "yes"; then
+ deplibs="$deplibs -lc"
+ fi
+ ;;
+ esac
+ fi
+
+ # Transform deplibs into only deplibs that can be linked in shared.
+ name_save=$name
+ libname_save=$libname
+ release_save=$release
+ versuffix_save=$versuffix
+ major_save=$major
+ # I'm not sure if I'm treating the release correctly. I think
+ # release should show up in the -l (ie -lgmp5) so we don't want to
+ # add it in twice. Is that correct?
+ release=""
+ versuffix=""
+ major=""
+ newdeplibs=
+ droppeddeps=no
+ case $deplibs_check_method in
+ pass_all)
+ # Don't check for shared/static. Everything works.
+ # This might be a little naive. We might want to check
+ # whether the library exists or not. But this is on
+ # osf3 & osf4 and I'm not really sure... Just
+ # implementing what was already the behaviour.
+ newdeplibs=$deplibs
+ ;;
+ test_compile)
+ # This code stresses the "libraries are programs" paradigm to its
+ # limits. Maybe even breaks it. We compile a program, linking it
+ # against the deplibs as a proxy for the library. Then we can check
+ # whether they linked in statically or dynamically with ldd.
+ $rm conftest.c
+ cat > conftest.c <<EOF
+ int main() { return 0; }
+EOF
+ $rm conftest
+ $LTCC -o conftest conftest.c $deplibs
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test -n "$name" && test "$name" != "0"; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ else
+ # Error occurred in the first compile. Let's try to salvage the situation:
+ # Compile a separate program for each library.
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test -n "$name" && test "$name" != "0"; then
+ $rm conftest
+ $LTCC -o conftest conftest.c $i
+ # Did it work?
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning! Library $i is needed by this library but I was not able to"
+ echo "*** make it link in! You will probably need to install it or some"
+ echo "*** library that it depends on before this library will be fully"
+ echo "*** functional. Installing it before continuing would be even better."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ fi
+ ;;
+ file_magic*)
+ set dummy $deplibs_check_method
+ file_magic_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+ for a_deplib in $deplibs; do
+ name="`expr $a_deplib : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test -n "$name" && test "$name" != "0"; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+ potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+ for potent_lib in $potential_libs; do
+ # Follow soft links.
+ if ls -lLd "$potent_lib" 2>/dev/null \
+ | grep " -> " >/dev/null; then
+ continue
+ fi
+ # The statement above tries to avoid entering an
+ # endless loop below, in case of cyclic links.
+ # We might still enter an endless loop, since a link
+ # loop can be closed while we follow links,
+ # but so what?
+ potlib="$potent_lib"
+ while test -h "$potlib" 2>/dev/null; do
+ potliblink=`ls -ld $potlib | sed 's/.* -> //'`
+ case $potliblink in
+ [\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+ *) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";;
+ esac
+ done
+ if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \
+ | sed 10q \
+ | egrep "$file_magic_regex" > /dev/null; then
+ newdeplibs="$newdeplibs $a_deplib"
+ a_deplib=""
+ break 2
+ fi
+ done
+ done
+ if test -n "$a_deplib" ; then
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $a_deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ # Add a -L argument.
+ newdeplibs="$newdeplibs $a_deplib"
+ fi
+ done # Gone through all deplibs.
+ ;;
+ match_pattern*)
+ set dummy $deplibs_check_method
+ match_pattern_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+ for a_deplib in $deplibs; do
+ name="`expr $a_deplib : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test -n "$name" && test "$name" != "0"; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+ potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+ for potent_lib in $potential_libs; do
+ if eval echo \"$potent_lib\" 2>/dev/null \
+ | sed 10q \
+ | egrep "$match_pattern_regex" > /dev/null; then
+ newdeplibs="$newdeplibs $a_deplib"
+ a_deplib=""
+ break 2
+ fi
+ done
+ done
+ if test -n "$a_deplib" ; then
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $a_deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ # Add a -L argument.
+ newdeplibs="$newdeplibs $a_deplib"
+ fi
+ done # Gone through all deplibs.
+ ;;
+ none | unknown | *)
+ newdeplibs=""
+ if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \
+ -e 's/ -[LR][^ ]*//g' -e 's/[ ]//g' |
+ grep . >/dev/null; then
+ echo
+ if test "X$deplibs_check_method" = "Xnone"; then
+ echo "*** Warning: inter-library dependencies are not supported in this platform."
+ else
+ echo "*** Warning: inter-library dependencies are not known to be supported."
+ fi
+ echo "*** All declared inter-library dependencies are being dropped."
+ droppeddeps=yes
+ fi
+ ;;
+ esac
+ versuffix=$versuffix_save
+ major=$major_save
+ release=$release_save
+ libname=$libname_save
+ name=$name_save
+
+ case $host in
+ *-*-rhapsody* | *-*-darwin1.[012])
+ # On Rhapsody replace the C library with the System framework
+ newdeplibs=`$echo "X $newdeplibs" | $Xsed -e 's/ -lc / -framework System /'`
+ ;;
+ esac
+
+ if test "$droppeddeps" = yes; then
+ if test "$module" = yes; then
+ echo
+ echo "*** Warning: libtool could not satisfy all declared inter-library"
+ echo "*** dependencies of module $libname. Therefore, libtool will create"
+ echo "*** a static module, that should work as long as the dlopening"
+ echo "*** application is linked with the -dlopen flag."
+ if test -z "$global_symbol_pipe"; then
+ echo
+ echo "*** However, this would only work if libtool was able to extract symbol"
+ echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+ echo "*** not find such a program. So, this module is probably useless."
+ echo "*** \`nm' from GNU binutils and a full rebuild may help."
+ fi
+ if test "$build_old_libs" = no; then
+ oldlibs="$output_objdir/$libname.$libext"
+ build_libtool_libs=module
+ build_old_libs=yes
+ else
+ build_libtool_libs=no
+ fi
+ else
+ echo "*** The inter-library dependencies that have been dropped here will be"
+ echo "*** automatically added whenever a program is linked with this library"
+ echo "*** or is declared to -dlopen it."
+
+ if test $allow_undefined = no; then
+ echo
+ echo "*** Since this library must not contain undefined symbols,"
+ echo "*** because either the platform does not support them or"
+ echo "*** it was explicitly requested with -no-undefined,"
+ echo "*** libtool will only create a static version of it."
+ if test "$build_old_libs" = no; then
+ oldlibs="$output_objdir/$libname.$libext"
+ build_libtool_libs=module
+ build_old_libs=yes
+ else
+ build_libtool_libs=no
+ fi
+ fi
+ fi
+ fi
+ # Done checking deplibs!
+ deplibs=$newdeplibs
+ fi
+
+ # All the library-specific variables (install_libdir is set above).
+ library_names=
+ old_library=
+ dlname=
+
+ # Test again, we may have decided not to build it any more
+ if test "$build_libtool_libs" = yes; then
+ if test $hardcode_into_libs = yes; then
+ # Hardcode the library paths
+ hardcode_libdirs=
+ dep_rpath=
+ rpath="$finalize_rpath"
+ test "$mode" != relink && rpath="$compile_rpath$rpath"
+ for libdir in $rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ dep_rpath="$dep_rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval dep_rpath=\"$hardcode_libdir_flag_spec\"
+ fi
+ if test -n "$runpath_var" && test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+ fi
+ test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+ fi
+
+ shlibpath="$finalize_shlibpath"
+ test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+ if test -n "$shlibpath"; then
+ eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+ fi
+
+ # Get the real and link names of the library.
+ eval library_names=\"$library_names_spec\"
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+
+ if test -n "$soname_spec"; then
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+ test -z "$dlname" && dlname=$soname
+
+ lib="$output_objdir/$realname"
+ for link
+ do
+ linknames="$linknames $link"
+ done
+
+# # Ensure that we have .o objects for linkers which dislike .lo
+# # (e.g. aix) in case we are running --disable-static
+# for obj in $libobjs; do
+# xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+# if test "X$xdir" = "X$obj"; then
+# xdir="."
+# else
+# xdir="$xdir"
+# fi
+# baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+# oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+# if test ! -f $xdir/$oldobj && test "$baseobj" != "$oldobj"; then
+# $show "(cd $xdir && ${LN_S} $baseobj $oldobj)"
+# $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $?
+# fi
+# done
+
+ # Use standard objects if they are pic
+ test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+ $show "generating symbol list for \`$libname.la'"
+ export_symbols="$output_objdir/$libname.exp"
+ $run $rm $export_symbols
+ eval cmds=\"$export_symbols_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ if test -n "$export_symbols_regex"; then
+ $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\""
+ $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+ $show "$mv \"${export_symbols}T\" \"$export_symbols\""
+ $run eval '$mv "${export_symbols}T" "$export_symbols"'
+ fi
+ fi
+ fi
+
+ if test -n "$export_symbols" && test -n "$include_expsyms"; then
+ $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"'
+ fi
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ save_libobjs=$libobjs
+ eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "$mkdir $gentop"
+ $run $mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects.
+ case $xlib in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "$mkdir $xdir"
+ $run $mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ libobjs="$libobjs "`find $xdir -name \*.$objext -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+ eval flag=\"$thread_safe_flag_spec\"
+ linker_flags="$linker_flags $flag"
+ fi
+
+ # Make a backup of the uninstalled library when relinking
+ if test "$mode" = relink; then
+ $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $?
+ fi
+
+ # Do each of the archive commands.
+ if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+ eval cmds=\"$archive_expsym_cmds\"
+ else
+ eval cmds=\"$archive_cmds\"
+ fi
+ if len=`expr "X$cmds" : ".*"` &&
+ test $len -le $max_cmd_len; then
+ :
+ else
+ # The command line is too long to link in one step, link piecewise.
+ $echo "creating reloadable object files..."
+
+ # Save the value of $output and $libobjs because we want to
+ # use them later. If we have whole_archive_flag_spec, we
+ # want to use save_libobjs as it was before
+ # whole_archive_flag_spec was expanded, because we can't
+ # assume the linker understands whole_archive_flag_spec.
+ # This may have to be revisited, in case too many
+ # convenience libraries get linked in and end up exceeding
+ # the spec.
+ if test -z "$convenience" || test -z "$whole_archive_flag_spec"; then
+ save_libobjs=$libobjs
+ fi
+ save_output=$output
+
+ # Clear the reloadable object creation command queue and
+ # initialize k to one.
+ test_cmds=
+ concat_cmds=
+ objlist=
+ delfiles=
+ last_robj=
+ k=1
+ output=$output_objdir/$save_output-${k}.$objext
+ # Loop over the list of objects to be linked.
+ for obj in $save_libobjs
+ do
+ eval test_cmds=\"$reload_cmds $objlist $last_robj\"
+ if test "X$objlist" = X ||
+ { len=`expr "X$test_cmds" : ".*"` &&
+ test $len -le $max_cmd_len; }; then
+ objlist="$objlist $obj"
+ else
+ # The command $test_cmds is almost too long, add a
+ # command to the queue.
+ if test $k -eq 1 ; then
+ # The first file doesn't have a previous command to add.
+ eval concat_cmds=\"$reload_cmds $objlist $last_robj\"
+ else
+ # All subsequent reloadable object files will link in
+ # the last one created.
+ eval concat_cmds=\"\$concat_cmds~$reload_cmds $objlist $last_robj\"
+ fi
+ last_robj=$output_objdir/$save_output-${k}.$objext
+ k=`expr $k + 1`
+ output=$output_objdir/$save_output-${k}.$objext
+ objlist=$obj
+ len=1
+ fi
+ done
+ # Handle the remaining objects by creating one last
+ # reloadable object file. All subsequent reloadable object
+ # files will link in the last one created.
+ test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+ eval concat_cmds=\"\${concat_cmds}$reload_cmds $objlist $last_robj\"
+
+ # Set up a command to remove the reloadable object files
+ # after they are used.
+ i=0
+ while test $i -lt $k
+ do
+ i=`expr $i + 1`
+ delfiles="$delfiles $output_objdir/$save_output-${i}.$objext"
+ done
+
+ $echo "creating a temporary reloadable object file: $output"
+
+ # Loop through the commands generated above and execute them.
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $concat_cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ libobjs=$output
+ # Restore the value of output.
+ output=$save_output
+
+ if test -n "$convenience" && test -n "$whole_archive_flag_spec"; then
+ eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+ fi
+ # Expand the library linking commands again to reset the
+ # value of $libobjs for piecewise linking.
+
+ # Do each of the archive commands.
+ if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+ eval cmds=\"$archive_expsym_cmds\"
+ else
+ eval cmds=\"$archive_cmds\"
+ fi
+
+ # Append the command to remove the reloadable object files
+ # to the just-reset $cmds.
+ eval cmds=\"\$cmds~$rm $delfiles\"
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Restore the uninstalled library and exit
+ if test "$mode" = relink; then
+ $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $?
+ exit 0
+ fi
+
+ # Create links to the real library.
+ for linkname in $linknames; do
+ if test "$realname" != "$linkname"; then
+ $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $?
+ fi
+ done
+
+ # If -module or -export-dynamic was specified, set the dlname.
+ if test "$module" = yes || test "$export_dynamic" = yes; then
+ # On all known operating systems, these are identical.
+ dlname="$soname"
+ fi
+ fi
+ ;;
+
+ obj)
+ if test -n "$deplibs"; then
+ $echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2
+ fi
+
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for objects" 1>&2
+ fi
+
+ case $output in
+ *.lo)
+ if test -n "$objs$old_deplibs"; then
+ $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
+ exit 1
+ fi
+ libobj="$output"
+ obj=`$echo "X$output" | $Xsed -e "$lo2o"`
+ ;;
+ *)
+ libobj=
+ obj="$output"
+ ;;
+ esac
+
+ # Delete the old objects.
+ $run $rm $obj $libobj
+
+ # Objects from convenience libraries. This assumes
+ # single-version convenience libraries. Whenever we create
+ # different ones for PIC/non-PIC, we'll have to duplicate
+ # the extraction.
+ reload_conv_objs=
+ gentop=
+ # reload_cmds runs $LD directly, so let us get rid of
+ # -Wl from whole_archive_flag_spec
+ wl=
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${obj}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "$mkdir $gentop"
+ $run $mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects of each archive into its own directory under $gentop.
+ case $xlib in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "$mkdir $xdir"
+ $run $mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+ # Append rather than overwrite, so the objects of every archive are kept.
+ reload_conv_objs="$reload_conv_objs "`find $xdir -name \*.$objext -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ # Create the old-style object.
+ reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}$'/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+
+ output="$obj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Exit if we aren't doing a library object file.
+ if test -z "$libobj"; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ fi
+
+ if test "$build_libtool_libs" != yes; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ # Create an invalid libtool object if no PIC, so that we don't
+ # accidentally link it into a program.
+ # $show "echo timestamp > $libobj"
+ # $run eval "echo timestamp > $libobj" || exit $?
+ exit 0
+ fi
+
+ if test -n "$pic_flag" || test "$pic_mode" != default; then
+ # Only do commands if we really have different PIC objects.
+ reload_objs="$libobjs $reload_conv_objs"
+ output="$libobj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+# else
+# # Just create a symlink.
+# $show $rm $libobj
+# $run $rm $libobj
+# xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+# if test "X$xdir" = "X$libobj"; then
+# xdir="."
+# else
+# xdir="$xdir"
+# fi
+# baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+# oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+# $show "(cd $xdir && $LN_S $oldobj $baseobj)"
+# $run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $?
+ fi
+
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ ;;
+
+ prog)
+ case $host in
+ *cygwin*) output=`echo $output | sed -e 's,.exe$,,;s,$,.exe,'` ;;
+ esac
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for programs" 1>&2
+ fi
+
+ if test "$preload" = yes; then
+ if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown &&
+ test "$dlopen_self_static" = unknown; then
+ $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support." 1>&2 # stderr, like the other warnings
+ fi
+ fi
+
+ case $host in
+ *-*-rhapsody* | *-*-darwin1.[012])
+ # On Rhapsody, replace the C library with the System framework
+ compile_deplibs=`$echo "X $compile_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+ finalize_deplibs=`$echo "X $finalize_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+ ;;
+ esac
+
+ compile_command="$compile_command $compile_deplibs"
+ finalize_command="$finalize_command $finalize_deplibs"
+
+ if test -n "$rpath$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ for libdir in $rpath $xrpath; do
+ # This is the magic to use -rpath.
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ fi
+
+ # Now hardcode the library paths
+ rpath=
+ hardcode_libdirs=
+ for libdir in $compile_rpath $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ case $host in
+ *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+ case :$dllsearchpath: in
+ *":$libdir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$libdir";;
+ esac
+ ;;
+ esac
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ compile_rpath="$rpath"
+
+ rpath=
+ hardcode_libdirs=
+ for libdir in $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$finalize_perm_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ finalize_rpath="$rpath"
+
+ dlsyms=
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ if test -n "$NM" && test -n "$global_symbol_pipe"; then
+ dlsyms="${outputname}S.c"
+ else
+ $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+ fi
+ fi
+
+ if test -n "$dlsyms"; then
+ case $dlsyms in
+ "") ;;
+ *.c)
+ # Discover the nlist of each of the dlfiles.
+ nlist="$output_objdir/${outputname}.nm"
+
+ $show "$rm $nlist ${nlist}S ${nlist}T"
+ $run $rm "$nlist" "${nlist}S" "${nlist}T"
+
+ # Parse the name list into a source file.
+ $show "creating $output_objdir/$dlsyms"
+
+ test -z "$run" && $echo > "$output_objdir/$dlsyms" "\
+/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */
+/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */
+
+#ifdef __cplusplus
+extern \"C\" {
+#endif
+
+/* Prevent the only kind of declaration conflicts we can make. */
+#define lt_preloaded_symbols some_other_symbol
+
+/* External symbol declarations for the compiler. */\
+"
+
+ if test "$dlself" = yes; then
+ $show "generating symbol list for \`$output'"
+
+ test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist"
+
+ # Add our own program objects to the symbol list.
+ progfiles="$objs$old_deplibs"
+ for arg in $progfiles; do
+ $show "extracting global C symbols from \`$arg'"
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -n "$exclude_expsyms"; then
+ $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ if test -n "$export_symbols_regex"; then
+ $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ export_symbols="$output_objdir/$output.exp"
+ $run $rm $export_symbols
+ $run eval "sed -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+ else
+ $run eval "sed -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"'
+ $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T'
+ $run eval 'mv "$nlist"T "$nlist"'
+ fi
+ fi
+
+ for arg in $dlprefiles; do
+ $show "extracting global C symbols from \`$arg'"
+ name=`echo "$arg" | sed -e 's%^.*/%%'`
+ $run eval 'echo ": $name " >> "$nlist"'
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -z "$run"; then
+ # Make sure we have at least an empty file.
+ test -f "$nlist" || : > "$nlist"
+
+ if test -n "$exclude_expsyms"; then
+ egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+ $mv "$nlist"T "$nlist"
+ fi
+
+ # Try sorting and uniquifying the output.
+ if grep -v "^: " < "$nlist" | sort +2 | uniq > "$nlist"S; then
+ :
+ else
+ grep -v "^: " < "$nlist" > "$nlist"S
+ fi
+
+ if test -f "$nlist"S; then
+ eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"'
+ else
+ echo '/* NONE */' >> "$output_objdir/$dlsyms"
+ fi
+
+ $echo >> "$output_objdir/$dlsyms" "\
+
+#undef lt_preloaded_symbols
+
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{\
+"
+
+ sed -n -e 's/^: \([^ ]*\) $/ {\"\1\", (lt_ptr_t) 0},/p' \
+ -e 's/^. \([^ ]*\) \([^ ]*\)$/ {"\2", (lt_ptr_t) \&\2},/p' \
+ < "$nlist" >> "$output_objdir/$dlsyms"
+
+ $echo >> "$output_objdir/$dlsyms" "\
+ {0, (lt_ptr_t) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+ return lt_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif\
+"
+ fi
+
+ pic_flag_for_symtable=
+ case $host in
+ # compiling the symbol table file with pic_flag works around
+ # a FreeBSD bug that causes programs to crash when -lm is
+ # linked before any other PIC object. But we must not use
+ # pic_flag when linking with -static. The problem exists in
+ # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+ *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND";;
+ esac;;
+ *-*-hpux*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag";;
+ esac
+ esac
+
+ # Now compile the dynamic symbol file.
+ $show "(cd $output_objdir && $LTCC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")"
+ $run eval '(cd $output_objdir && $LTCC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $?
+
+ # Clean up the generated files.
+ $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T"
+ $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T"
+
+ # Transform the symbol file into the correct name.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ ;;
+ *)
+ $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
+ exit 1
+ ;;
+ esac
+ else
+ # We keep going just in case the user didn't refer to
+ # lt_preloaded_symbols. The linker will fail if global_symbol_pipe
+ # really was required.
+
+ # Nullify the symbol file.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+ fi
+
+ if test $need_relink = no || test "$build_libtool_libs" != yes; then
+ # Replace the output file specification.
+ compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ link_command="$compile_command$compile_rpath"
+
+ # We have no uninstalled library dependencies, so finalize right now.
+ $show "$link_command"
+ $run eval "$link_command"
+ status=$?
+
+ # Delete the generated files.
+ if test -n "$dlsyms"; then
+ $show "$rm $output_objdir/${outputname}S.${objext}"
+ $run $rm "$output_objdir/${outputname}S.${objext}"
+ fi
+
+ exit $status
+ fi
+
+ if test -n "$shlibpath_var"; then
+ # We should set the shlibpath_var
+ rpath=
+ for dir in $temp_rpath; do
+ case $dir in
+ [\\/]* | [A-Za-z]:[\\/]*)
+ # Absolute path.
+ rpath="$rpath$dir:"
+ ;;
+ *)
+ # Relative path: add a thisdir entry.
+ rpath="$rpath\$thisdir/$dir:"
+ ;;
+ esac
+ done
+ temp_rpath="$rpath"
+ fi
+
+ if test -n "$compile_shlibpath$finalize_shlibpath"; then
+ compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
+ fi
+ if test -n "$finalize_shlibpath"; then
+ finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+ fi
+
+ compile_var=
+ finalize_var=
+ if test -n "$runpath_var"; then
+ if test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ if test -n "$finalize_perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $finalize_perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ fi
+
+ if test "$no_install" = yes; then
+ # We don't need to create a wrapper script.
+ link_command="$compile_var$compile_command$compile_rpath"
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ # Delete the old output file.
+ $run $rm $output
+ # Link the executable and exit
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+ exit 0
+ fi
+
+ if test "$hardcode_action" = relink; then
+ # Fast installation is not supported
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+ $echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2
+ $echo "$modename: \`$output' will be relinked during installation" 1>&2
+ else
+ if test "$fast_install" != no; then
+ link_command="$finalize_var$compile_command$finalize_rpath"
+ if test "$fast_install" = yes; then
+ relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'`
+ else
+ # fast_install is set to needless
+ relink_command=
+ fi
+ else
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+ fi
+ fi
+
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+ # Delete the old output files.
+ $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+
+ # Now create the wrapper script.
+ $show "creating $output"
+
+ # Quote the relink command for shipping.
+ if test -n "$relink_command"; then
+ # Preserve any variables that may affect compiler behavior
+ for var in $variables_saved_for_relink; do
+ if eval test -z \"\${$var+set}\"; then
+ relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+ elif eval var_value=\$$var; test -z "$var_value"; then
+ relink_command="$var=; export $var; $relink_command"
+ else
+ var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+ relink_command="$var=\"$var_value\"; export $var; $relink_command"
+ fi
+ done
+ relink_command="cd `pwd`; $relink_command"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Quote $echo for shipping.
+ if test "X$echo" = "X$SHELL $0 --fallback-echo"; then
+ case $0 in
+ [\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";;
+ *) qecho="$SHELL `pwd`/$0 --fallback-echo";;
+ esac
+ qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"`
+ else
+ qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Only actually do things if our run command is non-null.
+ if test -z "$run"; then
+ # win32 will think the script is a binary if it has
+ # a .exe suffix, so we strip it off here.
+ case $output in
+ *.exe) output=`echo $output|sed 's,.exe$,,'` ;;
+ esac
+ # test for cygwin because mv fails w/o .exe extensions
+ case $host in
+ *cygwin*) exeext=.exe ;;
+ *) exeext= ;;
+ esac
+ $rm $output
+ trap "$rm $output; exit 1" 1 2 15
+
+ $echo > $output "\
+#! $SHELL
+
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# The $output program cannot be directly executed until all the libtool
+# libraries that it depends on are installed.
+#
+# This wrapper script should never be moved out of the build directory.
+# If it is, it will not operate correctly.
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='$sed_quote_subst'
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi
+
+relink_command=\"$relink_command\"
+
+# This environment variable determines our operation mode.
+if test \"\$libtool_install_magic\" = \"$magic\"; then
+ # install mode needs the following variable:
+ notinst_deplibs='$notinst_deplibs'
+else
+ # When we are sourced in execute mode, \$file and \$echo are already set.
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ echo=\"$qecho\"
+ file=\"\$0\"
+ # Make sure echo works.
+ if test \"X\$1\" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+ elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then
+ # Yippee, \$echo works!
+ :
+ else
+ # Restart under the correct shell, and then maybe \$echo will work.
+ exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"}
+ fi
+ fi\
+"
+ $echo >> $output "\
+
+ # Find the directory that this script lives in.
+ thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
+ test \"x\$thisdir\" = \"x\$file\" && thisdir=.
+
+ # Follow symbolic links until we get to the real thisdir.
+ file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
+ while test -n \"\$file\"; do
+ destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
+
+ # If there was a directory component, then change thisdir.
+ if test \"x\$destdir\" != \"x\$file\"; then
+ case \"\$destdir\" in
+ [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;;
+ *) thisdir=\"\$thisdir/\$destdir\" ;;
+ esac
+ fi
+
+ file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
+ file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
+ done
+
+ # Try to get the absolute directory name.
+ absdir=\`cd \"\$thisdir\" && pwd\`
+ test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
+
+ if test "$fast_install" = yes; then
+ echo >> $output "\
+ program=lt-'$outputname'$exeext
+ progdir=\"\$thisdir/$objdir\"
+
+ if test ! -f \"\$progdir/\$program\" || \\
+ { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | sed 1q\`; \\
+ test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+ file=\"\$\$-\$program\"
+
+ if test ! -d \"\$progdir\"; then
+ $mkdir \"\$progdir\"
+ else
+ $rm \"\$progdir/\$file\"
+ fi"
+
+ echo >> $output "\
+
+ # relink executable if necessary
+ if test -n \"\$relink_command\"; then
+ if relink_command_output=\`eval \$relink_command 2>&1\`; then :
+ else
+ $echo \"\$relink_command_output\" >&2
+ $rm \"\$progdir/\$file\"
+ exit 1
+ fi
+ fi
+
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+ { $rm \"\$progdir/\$program\";
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+ $rm \"\$progdir/\$file\"
+ fi"
+ else
+ echo >> $output "\
+ program='$outputname'
+ progdir=\"\$thisdir/$objdir\"
+"
+ fi
+
+ echo >> $output "\
+
+ if test -f \"\$progdir/\$program\"; then"
+
+ # Export our shlibpath_var if we have one.
+ if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+ $echo >> $output "\
+ # Add our own library path to $shlibpath_var
+ $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
+
+ # Some systems cannot cope with colon-terminated $shlibpath_var
+ # The second colon is a workaround for a bug in BeOS R4 sed
+ $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\`
+
+ export $shlibpath_var
+"
+ fi
+
+ # fixup the dll searchpath if we need to.
+ if test -n "$dllsearchpath"; then
+ $echo >> $output "\
+ # Add the dll search path components to the executable PATH
+ PATH=$dllsearchpath:\$PATH
+"
+ fi
+
+ $echo >> $output "\
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ # Run the actual program with our arguments.
+"
+ case $host in
+ # win32 systems need to use the prog path for dll
+ # lookup to work
+ *-*-cygwin* | *-*-pw32*)
+ $echo >> $output "\
+ exec \$progdir/\$program \${1+\"\$@\"}
+"
+ ;;
+
+ # Backslashes separate directories on plain windows
+ *-*-mingw | *-*-os2*)
+ $echo >> $output "\
+ exec \$progdir\\\\\$program \${1+\"\$@\"}
+"
+ ;;
+
+ *)
+ $echo >> $output "\
+ # Export the path to the program.
+ PATH=\"\$progdir:\$PATH\"
+ export PATH
+
+ exec \$program \${1+\"\$@\"}
+"
+ ;;
+ esac
+ $echo >> $output "\
+ \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
+ exit 1
+ fi
+ else
+ # The program doesn't exist.
+ \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
+ \$echo \"This script is just a wrapper for \$program.\" 1>&2
+ echo \"See the $PACKAGE documentation for more information.\" 1>&2
+ exit 1
+ fi
+fi\
+"
+ chmod +x $output
+ fi
+ exit 0
+ ;;
+ esac
+
+ # See if we need to build an old-fashioned archive.
+ for oldlib in $oldlibs; do
+
+ if test "$build_libtool_libs" = convenience; then
+ oldobjs="$libobjs_save"
+ addlibs="$convenience"
+ build_libtool_libs=no
+ else
+ if test "$build_libtool_libs" = module; then
+ oldobjs="$libobjs_save"
+ build_libtool_libs=no
+ else
+ oldobjs="$objs$old_deplibs $non_pic_objects"
+ fi
+ addlibs="$old_convenience"
+ fi
+
+ if test -n "$addlibs"; then
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "$mkdir $gentop"
+ $run $mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ # Add in members from convenience archives.
+ for xlib in $addlibs; do
+ # Extract the objects.
+ case $xlib in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "$mkdir $xdir"
+ $run $mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print | $NL2SP`
+ done
+ fi
+
+ # Do each command in the archive commands.
+ if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+ eval cmds=\"$old_archive_from_new_cmds\"
+ else
+# # Ensure that we have .o objects in place in case we decided
+# # not to build a shared library, and have fallen back to building
+# # static libs even though --disable-static was passed!
+# for oldobj in $oldobjs; do
+# if test ! -f $oldobj; then
+# xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'`
+# if test "X$xdir" = "X$oldobj"; then
+# xdir="."
+# else
+# xdir="$xdir"
+# fi
+# baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'`
+# obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+# $show "(cd $xdir && ${LN_S} $obj $baseobj)"
+# $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $?
+# fi
+# done
+
+ eval cmds=\"$old_archive_cmds\"
+
+ if len=`expr "X$cmds" : ".*"` &&
+ test $len -le $max_cmd_len; then
+ :
+ else
+ # the command line is too long to link in one step, link in parts
+ $echo "using piecewise archive linking..."
+ save_RANLIB=$RANLIB
+ RANLIB=:
+ objlist=
+ concat_cmds=
+ save_oldobjs=$oldobjs
+ for obj in $save_oldobjs
+ do
+ oldobjs="$objlist $obj"
+ objlist="$objlist $obj"
+ eval test_cmds=\"$old_archive_cmds\"
+ if len=`expr "X$test_cmds" : ".*"` &&
+ test $len -le $max_cmd_len; then
+ :
+ else
+ # the above command should be used before it gets too long
+ oldobjs=$objlist
+ test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+ eval concat_cmds=\"\${concat_cmds}$old_archive_cmds\"
+ objlist=
+ fi
+ done
+ RANLIB=$save_RANLIB
+ oldobjs=$objlist
+ eval cmds=\"\$concat_cmds~$old_archive_cmds\"
+ fi
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$generated"; then
+ $show "${rm}r$generated"
+ $run ${rm}r$generated
+ fi
+
+ # Now create the libtool archive.
+ case $output in
+ *.la)
+ old_library=
+ test "$build_old_libs" = yes && old_library="$libname.$libext"
+ $show "creating $output"
+
+ # Preserve any variables that may affect compiler behavior: for each name in $variables_saved_for_relink, prepend a matching unset/empty/value setting to relink_command
+ for var in $variables_saved_for_relink; do
+ if eval test -z \"\${$var+set}\"; then
+ relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+ elif eval var_value=\$$var; test -z "$var_value"; then
+ relink_command="$var=; export $var; $relink_command"
+ else
+ var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+ relink_command="$var=\"$var_value\"; export $var; $relink_command"
+ fi
+ done
+ # Quote the link command for shipping. NOTE(review): the next assignment replaces relink_command outright, so the settings prepended by the loop above appear to be discarded -- confirm intent.
+ relink_command="cd `pwd`; $SHELL $0 --mode=relink $libtool_args"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+
+ # Only create the output if not a dry run.
+ if test -z "$run"; then
+ for installed in no yes; do
+ if test "$installed" = yes; then
+ if test -z "$install_libdir"; then
+ break
+ fi
+ output="$output_objdir/$outputname"i
+ # Replace all uninstalled libtool libraries with the installed ones
+ newdependency_libs=
+ for deplib in $dependency_libs; do
+ case $deplib in
+ *.la)
+ name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdependency_libs="$newdependency_libs $libdir/$name"
+ ;;
+ *) newdependency_libs="$newdependency_libs $deplib" ;;
+ esac
+ done
+ dependency_libs="$newdependency_libs"
+ newdlfiles=
+ for lib in $dlfiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlfiles="$newdlfiles $libdir/$name"
+ done
+ dlfiles="$newdlfiles"
+ newdlprefiles=
+ for lib in $dlprefiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlprefiles="$newdlprefiles $libdir/$name"
+ done
+ dlprefiles="$newdlprefiles"
+ fi
+ $rm $output
+ # place dlname in correct position for cygwin
+ tdlname=$dlname
+ case $host,$output,$installed,$module,$dlname in
+ *cygwin*,*lai,yes,no,*.dll) tdlname=../bin/$dlname ;;
+ esac
+ $echo > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='$tdlname'
+
+# Names of this library.
+library_names='$library_names'
+
+# The name of the static archive.
+old_library='$old_library'
+
+# Libraries that this one depends upon.
+dependency_libs='$dependency_libs'
+
+# Version information for $libname.
+current=$current
+age=$age
+revision=$revision
+
+# Is this an already installed library?
+installed=$installed
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
+# Directory that this library needs to be installed in:
+libdir='$install_libdir'"
+ if test "$installed" = no && test $need_relink = yes; then
+ $echo >> $output "\
+relink_command=\"$relink_command\""
+ fi
+ done
+ fi
+
+ # Do a symbolic link so that the libtool archive can be found in
+ # LD_LIBRARY_PATH before the program is installed.
+ $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)"
+ $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $?
+ ;;
+ esac
+ exit 0
+ ;;
+
+ # libtool install mode
+ #
+ # This first part of the arm splits the command line into
+ # install_prog (the install program plus its quoted options),
+ # files (every source operand) and dest (the last operand), and then
+ # validates the destination before anything is installed.
+ install)
+ modename="$modename: install"
+
+ # There may be an optional sh(1) argument at the beginning of
+ # install_prog (especially on Windows NT).
+ if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh ||
+ # Allow the use of GNU shtool's install command.
+ $echo "X$nonopt" | $Xsed | grep shtool > /dev/null; then
+ # Aesthetically quote it.
+ arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$arg "
+ # The wrapper consumed $nonopt, so the real program is the next argument.
+ arg="$1"
+ shift
+ else
+ install_prog=
+ arg="$nonopt"
+ fi
+
+ # The real first argument should be the name of the installation program.
+ # Aesthetically quote it.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog$arg"
+
+ # We need to accept at least all the BSD install flags.
+ dest=
+ files=
+ opts=
+ prev=
+ install_type=
+ isdir=no
+ stripme=
+ for arg
+ do
+ # Once a candidate destination has been seen, every later argument
+ # becomes the new candidate and the previous one joins $files.
+ if test -n "$dest"; then
+ files="$files $dest"
+ dest="$arg"
+ continue
+ fi
+
+ case $arg in
+ -d) isdir=yes ;;
+ -f) prev="-f" ;;
+ -g) prev="-g" ;;
+ -m) prev="-m" ;;
+ -o) prev="-o" ;;
+ -s)
+ # Remembered separately and not appended to install_prog here.
+ stripme=" -s"
+ continue
+ ;;
+ -*) ;;
+
+ *)
+ # If the previous option needed an argument, then skip it.
+ if test -n "$prev"; then
+ prev=
+ else
+ dest="$arg"
+ continue
+ fi
+ ;;
+ esac
+
+ # Aesthetically quote the argument.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case $arg in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog $arg"
+ done
+
+ if test -z "$install_prog"; then
+ $echo "$modename: you must specify an install program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # A non-empty $prev means the last option never received its argument.
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prev' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # With a single operand it was stored in $dest and $files stayed empty.
+ if test -z "$files"; then
+ if test -z "$dest"; then
+ $echo "$modename: no file or destination specified" 1>&2
+ else
+ $echo "$modename: you must specify a destination" 1>&2
+ fi
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Strip any trailing slash from the destination.
+ dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
+
+ # Check to see that the destination is a directory.
+ test -d "$dest" && isdir=yes
+ if test "$isdir" = yes; then
+ destdir="$dest"
+ destname=
+ else
+ # Split the destination into its directory and file name parts.
+ destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$destdir" = "X$dest" && destdir=.
+ destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
+
+ # Not a directory, so check to see that there is only one file specified.
+ # ("dummy" occupies $1, so more than one file means $# is above 2.)
+ set dummy $files
+ if test $# -gt 2; then
+ $echo "$modename: \`$dest' is not a directory" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ fi
+ # A relative destination directory is accepted only when every file is
+ # a libtool object (*.lo); anything else must go to an absolute path.
+ case $destdir in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ for file in $files; do
+ case $file in
+ *.lo) ;;
+ *)
+ $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ esac
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ # staticlibs collects archives that are installed after this loop.
+ # current_libdirs/future_libdirs collect the libdir of each .la file,
+ # depending on whether it equals the destination directory.
+ staticlibs=
+ future_libdirs=
+ current_libdirs=
+ for file in $files; do
+
+ # Do each installation.
+ case $file in
+ *.$libext)
+ # Do the static libraries later.
+ staticlibs="$staticlibs $file"
+ ;;
+
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Reset the variables read below, then source the .la file to set them.
+ library_names=
+ old_library=
+ relink_command=
+ # If there is no directory component, then add one.
+ case $file in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Add the libdir to current_libdirs if it is the destination.
+ if test "X$destdir" = "X$libdir"; then
+ case "$current_libdirs " in
+ *" $libdir "*) ;;
+ *) current_libdirs="$current_libdirs $libdir" ;;
+ esac
+ else
+ # Note the libdir as a future libdir.
+ case "$future_libdirs " in
+ *" $libdir "*) ;;
+ *) future_libdirs="$future_libdirs $libdir" ;;
+ esac
+ fi
+
+ # dir becomes the objdir next to the .la file (just $objdir when the
+ # file name has no directory component).
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/
+ test "X$dir" = "X$file/" && dir=
+ dir="$dir$objdir"
+
+ if test -n "$relink_command"; then
+ $echo "$modename: warning: relinking \`$file'" 1>&2
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ # A failed relink skips this library instead of aborting the install.
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ continue
+ fi
+ fi
+
+ # See the names of the shared library.
+ set dummy $library_names
+ if test -n "$2"; then
+ # The first name is the real file; after the two shifts the
+ # positional parameters hold the remaining names, used as symlinks.
+ realname="$2"
+ shift
+ shift
+
+ # When a relink command exists the source file carries a "T" suffix.
+ # NOTE(review): presumably the relinked copy; confirm in link mode.
+ srcname="$realname"
+ test -n "$relink_command" && srcname="$realname"T
+
+ # Install the shared library and build the symlinks.
+ $show "$install_prog $dir/$srcname $destdir/$realname"
+ $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $?
+ if test -n "$stripme" && test -n "$striplib"; then
+ $show "$striplib $destdir/$realname"
+ $run eval "$striplib $destdir/$realname" || exit $?
+ fi
+
+ if test $# -gt 0; then
+ # Delete the old symlinks, and create new ones.
+ for linkname
+ do
+ if test "$linkname" != "$realname"; then
+ $show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ fi
+ done
+ fi
+
+ # Do each command in the postinstall commands.
+ # (The commands are separated by '~', hence the temporary IFS.)
+ lib="$destdir/$realname"
+ eval cmds=\"$postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Install the pseudo-library for information purposes.
+ # The source is the ".lai" file in objdir (note the trailing "i").
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ instname="$dir/$name"i
+ $show "$install_prog $instname $destdir/$name"
+ $run eval "$install_prog $instname $destdir/$name" || exit $?
+
+ # Maybe install the static library, too.
+ test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
+ ;;
+
+ *.lo)
+ # Install (i.e. copy) a libtool object.
+
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Deduce the name of the destination old-style object file.
+ case $destfile in
+ *.lo)
+ staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"`
+ ;;
+ *.$objext)
+ # Only the plain object is wanted; the empty destfile skips the .lo copy.
+ staticdest="$destfile"
+ destfile=
+ ;;
+ *)
+ $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Install the libtool object if requested.
+ if test -n "$destfile"; then
+ $show "$install_prog $file $destfile"
+ $run eval "$install_prog $file $destfile" || exit $?
+ fi
+
+ # Install the old object if enabled.
+ if test "$build_old_libs" = yes; then
+ # Deduce the name of the old-style object file.
+ staticobj=`$echo "X$file" | $Xsed -e "$lo2o"`
+
+ $show "$install_prog $staticobj $staticdest"
+ $run eval "$install_prog \$staticobj \$staticdest" || exit $?
+ fi
+ # NOTE(review): this exits after the first .lo file, so any remaining
+ # entries in $files are not processed - confirm that is intended.
+ exit 0
+ ;;
+
+ *)
+ # Install a program (or any other plain file). If $file is a libtool
+ # wrapper script, the real binary is installed instead: either a copy
+ # relinked into a private temporary directory, or the binary kept in
+ # $objdir. Other files are installed unchanged.
+
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ notinst_deplibs=
+ relink_command=
+
+ # If there is no directory component, then add one.
+ case $file in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Check the variables that should have been set.
+ if test -z "$notinst_deplibs"; then
+ $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
+ exit 1
+ fi
+
+ # finalize stays "yes" only if every uninstalled dependency has
+ # already been installed in its libdir.
+ finalize=yes
+ for lib in $notinst_deplibs; do
+ # Check to see that each library is installed.
+ libdir=
+ if test -f "$lib"; then
+ # If there is no directory component, then add one.
+ case $lib in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+ fi
+ libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test
+ if test -n "$libdir" && test ! -f "$libfile"; then
+ $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
+ finalize=no
+ fi
+ done
+
+ # Sourcing the .la files above may have overwritten relink_command,
+ # so read the wrapper script again.
+ relink_command=
+ # If there is no directory component, then add one.
+ case $file in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ outputname=
+ if test "$fast_install" = no && test -n "$relink_command"; then
+ if test "$finalize" = yes && test -z "$run"; then
+ tmpdir="/tmp"
+ test -n "$TMPDIR" && tmpdir="$TMPDIR"
+ tmpdir="$tmpdir/libtool-$$"
+ # Create the directory ourselves, without -p and under a restrictive
+ # umask. The name is predictable, and "mkdir -p" would succeed on a
+ # directory that another user created in advance, letting that user
+ # tamper with the binary we are about to install.
+ save_umask=`umask`
+ umask 0077
+ if $mkdir "$tmpdir"; then
+ umask $save_umask
+ else
+ umask $save_umask
+ $echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2
+ continue
+ fi
+ file=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ outputname="$tmpdir/$file"
+ # Replace the output file specification.
+ relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'`
+
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ ${rm}r "$tmpdir"
+ continue
+ fi
+ file="$outputname"
+ else
+ $echo "$modename: warning: cannot relink \`$file'" 1>&2
+ fi
+ else
+ # Install the binary that we compiled earlier.
+ file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
+ fi
+ fi
+
+
+ # remove .exe since cygwin /usr/bin/install will append another
+ # one anyways
+ case $install_prog,$host in
+ */usr/bin/install*,*cygwin*)
+ case $file:$destfile in
+ *.exe:*.exe)
+ # this is ok
+ ;;
+ *.exe:*)
+ destfile=$destfile.exe
+ ;;
+ *:*.exe)
+ destfile=`echo $destfile | sed -e 's,.exe$,,'`
+ ;;
+ esac
+ ;;
+ esac
+
+ $show "$install_prog$stripme $file $destfile"
+ $run eval "$install_prog\$stripme \$file \$destfile" || exit $?
+ # outputname is non-empty only when a temporary relink directory was made.
+ test -n "$outputname" && ${rm}r "$tmpdir"
+ ;;
+ esac
+ done
+
+ # Install every static archive collected in $staticlibs, optionally strip
+ # it, and run the old_postinstall_cmds on it. Afterwards either hand over
+ # to finish mode (via exec_cmd) or exit.
+ for file in $staticlibs; do
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+
+ # Set up the ranlib parameters.
+ oldlib="$destdir/$name"
+
+ $show "$install_prog $file $oldlib"
+ $run eval "$install_prog \$file \$oldlib" || exit $?
+
+ # Guard on the command that is actually run: static archives are
+ # stripped with $old_striplib, not with the shared-library $striplib.
+ if test -n "$stripme" && test -n "$old_striplib"; then
+ $show "$old_striplib $oldlib"
+ $run eval "$old_striplib $oldlib" || exit $?
+ fi
+
+ # Do each command in the postinstall commands.
+ # (The commands are separated by '~', hence the temporary IFS.)
+ eval cmds=\"$old_postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$future_libdirs"; then
+ $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
+ fi
+
+ if test -n "$current_libdirs"; then
+ # Maybe just do a dry run.
+ test -n "$run" && current_libdirs=" -n$current_libdirs"
+ # Single quotes on purpose: the command is expanded when it is eval'ed
+ # at the end of the script.
+ exec_cmd='$SHELL $0 --finish$current_libdirs'
+ else
+ exit 0
+ fi
+ ;;
+
+ # libtool finish mode
+ #
+ # Runs the configured finish_cmds / finish_eval for every library
+ # directory and then prints instructions for using the installed
+ # libraries. Commands that fail do not abort; they are collected in
+ # admincmds and listed in the final message.
+ finish)
+ modename="$modename: finish"
+ libdirs="$nonopt"
+ admincmds=
+
+ if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+ # Append the remaining positional arguments to the list of directories.
+ for dir
+ do
+ libdirs="$libdirs $dir"
+ done
+
+ for libdir in $libdirs; do
+ if test -n "$finish_cmds"; then
+ # Do each command in the finish commands.
+ # (The commands are separated by '~', hence the temporary IFS.)
+ eval cmds=\"$finish_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || admincmds="$admincmds
+ $cmd"
+ done
+ IFS="$save_ifs"
+ fi
+ if test -n "$finish_eval"; then
+ # Do the single finish_eval.
+ eval cmds=\"$finish_eval\"
+ $run eval "$cmds" || admincmds="$admincmds
+ $cmds"
+ fi
+ done
+ fi
+
+ # Exit here if they wanted silent mode.
+ test "$show" = ":" && exit 0
+
+ echo "----------------------------------------------------------------------"
+ echo "Libraries have been installed in:"
+ for libdir in $libdirs; do
+ echo " $libdir"
+ done
+ echo
+ echo "If you ever happen to want to link against installed libraries"
+ echo "in a given directory, LIBDIR, you must either use libtool, and"
+ echo "specify the full pathname of the library, or use the \`-LLIBDIR'"
+ echo "flag during linking and do at least one of the following:"
+ if test -n "$shlibpath_var"; then
+ echo " - add LIBDIR to the \`$shlibpath_var' environment variable"
+ echo " during execution"
+ fi
+ if test -n "$runpath_var"; then
+ echo " - add LIBDIR to the \`$runpath_var' environment variable"
+ echo " during linking"
+ fi
+ if test -n "$hardcode_libdir_flag_spec"; then
+ # Expand the flag template with the literal placeholder LIBDIR.
+ libdir=LIBDIR
+ eval flag=\"$hardcode_libdir_flag_spec\"
+
+ echo " - use the \`$flag' linker flag"
+ fi
+ if test -n "$admincmds"; then
+ echo " - have your system administrator run these commands:$admincmds"
+ fi
+ if test -f /etc/ld.so.conf; then
+ echo " - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+ fi
+ echo
+ echo "See any operating system documentation about shared libraries for"
+ echo "more information, such as the ld(1) and ld.so(8) manual pages."
+ echo "----------------------------------------------------------------------"
+ exit 0
+ ;;
+
+ # libtool execute mode
+ #
+ # Prepends the directory of every -dlopen file ($execute_dlfiles) to the
+ # variable named by $shlibpath_var, replaces libtool wrapper scripts among
+ # the arguments by the real program, and then either prepares exec_cmd
+ # (run at the end of the script) or, in a dry run, prints what would be
+ # executed.
+ execute)
+ modename="$modename: execute"
+
+ # The first argument is the command name.
+ cmd="$nonopt"
+ if test -z "$cmd"; then
+ $echo "$modename: you must specify a COMMAND" 1>&2
+ # Usage errors belong on stderr, like every other error path here.
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Handle -dlopen flags immediately.
+ for file in $execute_dlfiles; do
+ if test ! -f "$file"; then
+ $echo "$modename: \`$file' is not a file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ dir=
+ case $file in
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ # Report the file being examined; no $lib is set in this mode.
+ $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Read the libtool library.
+ dlname=
+ library_names=
+
+ # If there is no directory component, then add one.
+ case $file in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Skip this library if it cannot be dlopened.
+ if test -z "$dlname"; then
+ # Warn if it was a shared library.
+ test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'" 1>&2
+ continue
+ fi
+
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+
+ if test -f "$dir/$objdir/$dlname"; then
+ dir="$dir/$objdir"
+ else
+ $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
+ exit 1
+ fi
+ ;;
+
+ *.lo)
+ # Just add the directory containing the .lo file.
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+ ;;
+
+ *)
+ $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
+ continue
+ ;;
+ esac
+
+ # Get the absolute pathname.
+ absdir=`cd "$dir" && pwd`
+ test -n "$absdir" && dir="$absdir"
+
+ # Now add the directory to shlibpath_var.
+ # ($shlibpath_var holds the NAME of the variable, hence the evals.)
+ if eval "test -z \"\$$shlibpath_var\""; then
+ eval "$shlibpath_var=\"\$dir\""
+ else
+ eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
+ fi
+ done
+
+ # This variable tells wrapper scripts just to set shlibpath_var
+ # rather than running their programs.
+ libtool_execute_magic="$magic"
+
+ # Check if any of the arguments is a wrapper script.
+ args=
+ for file
+ do
+ case $file in
+ -*) ;;
+ *)
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ # If there is no directory component, then add one.
+ case $file in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Transform arg to wrapped name.
+ file="$progdir/$program"
+ fi
+ ;;
+ esac
+ # Quote arguments (to preserve shell metacharacters).
+ file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
+ args="$args \"$file\""
+ done
+
+ if test -z "$run"; then
+ if test -n "$shlibpath_var"; then
+ # Export the shlibpath_var.
+ eval "export $shlibpath_var"
+ fi
+
+ # Restore saved environment variables
+ if test "${save_LC_ALL+set}" = set; then
+ LC_ALL="$save_LC_ALL"; export LC_ALL
+ fi
+ if test "${save_LANG+set}" = set; then
+ LANG="$save_LANG"; export LANG
+ fi
+
+ # Now prepare to actually exec the command.
+ exec_cmd='"$cmd"$args'
+ else
+ # Display what would be done.
+ if test -n "$shlibpath_var"; then
+ eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
+ $echo "export $shlibpath_var"
+ fi
+ $echo "$cmd$args"
+ exit 0
+ fi
+ ;;
+
+ # libtool clean and uninstall mode
+ #
+ # $nonopt is the RM program; remaining arguments starting with "-" are
+ # passed to it and the rest are files. For each file, every associated
+ # file is removed with it (library names and archives listed in a .la,
+ # the objects listed in a .lo, the real binary behind a wrapper script).
+ # In clean mode the emptied objdirs are removed as well.
+ # The exit status is 1 if any removal failed, otherwise 0.
+ clean | uninstall)
+ modename="$modename: $mode"
+ rm="$nonopt"
+ files=
+ rmforce=
+ exit_status=0
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ for arg
+ do
+ case $arg in
+ -f) rm="$rm $arg"; rmforce=yes ;;
+ -*) rm="$rm $arg" ;;
+ *) files="$files $arg" ;;
+ esac
+ done
+
+ if test -z "$rm"; then
+ $echo "$modename: you must specify an RM program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ rmdirs=
+
+ # objdir is recomputed for every file below. Keep the configured value
+ # so that directory prefixes do not pile up from one file to the next.
+ origobjdir="$objdir"
+ for file in $files; do
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$dir" = "X$file"; then
+ dir=.
+ objdir="$origobjdir"
+ else
+ objdir="$dir/$origobjdir"
+ fi
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ # Installed files live next to the .la file, not in an objdir.
+ test $mode = uninstall && objdir="$dir"
+
+ # Remember objdir for removal later, being careful to avoid duplicates
+ if test $mode = clean; then
+ case " $rmdirs " in
+ *" $objdir "*) ;;
+ *) rmdirs="$rmdirs $objdir" ;;
+ esac
+ fi
+
+ # Don't error if the file doesn't exist and rm -f was used.
+ if (test -L "$file") >/dev/null 2>&1 \
+ || (test -h "$file") >/dev/null 2>&1 \
+ || test -f "$file"; then
+ :
+ elif test -d "$file"; then
+ exit_status=1
+ continue
+ elif test "$rmforce" = yes; then
+ continue
+ fi
+
+ rmfiles="$file"
+
+ case $name in
+ *.la)
+ # Possibly a libtool archive, so verify it.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ . $dir/$name
+
+ # Delete the libtool libraries and symlinks.
+ for n in $library_names; do
+ rmfiles="$rmfiles $objdir/$n"
+ done
+ test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library"
+ test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i"
+
+ if test $mode = uninstall; then
+ if test -n "$library_names"; then
+ # Do each command in the postuninstall commands.
+ eval cmds=\"$postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ if test $? != 0 && test "$rmforce" != yes; then
+ exit_status=1
+ fi
+ done
+ IFS="$save_ifs"
+ fi
+
+ if test -n "$old_library"; then
+ # Do each command in the old_postuninstall commands.
+ eval cmds=\"$old_postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ if test $? != 0 && test "$rmforce" != yes; then
+ exit_status=1
+ fi
+ done
+ IFS="$save_ifs"
+ fi
+ # FIXME: should reinstall the best remaining shared library.
+ fi
+ fi
+ ;;
+
+ *.lo)
+ # Possibly a libtool object, so verify it.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+
+ # Read the .lo file
+ . $dir/$name
+
+ # Add PIC object to the list of files to remove.
+ if test -n "$pic_object" \
+ && test "$pic_object" != none; then
+ rmfiles="$rmfiles $dir/$pic_object"
+ fi
+
+ # Add non-PIC object to the list of files to remove.
+ if test -n "$non_pic_object" \
+ && test "$non_pic_object" != none; then
+ rmfiles="$rmfiles $dir/$non_pic_object"
+ fi
+ fi
+ ;;
+
+ *)
+ # Do a test to see if this is a libtool program.
+ if test $mode = clean &&
+ (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ relink_command=
+ # Source by base name: $file may already contain $dir, and
+ # "$dir/$file" would then repeat the directory.
+ . $dir/$name
+
+ rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}"
+ if test "$fast_install" = yes && test -n "$relink_command"; then
+ rmfiles="$rmfiles $objdir/lt-$name"
+ fi
+ fi
+ ;;
+ esac
+ $show "$rm $rmfiles"
+ $run $rm $rmfiles || exit_status=1
+ done
+
+ # Try to remove the ${objdir}s in the directories where we deleted files
+ for dir in $rmdirs; do
+ if test -d "$dir"; then
+ $show "rmdir $dir"
+ $run rmdir $dir >/dev/null 2>&1
+ fi
+ done
+
+ exit $exit_status
+ ;;
+
+ # No mode was given at all.
+ "")
+ $echo "$modename: you must specify a MODE" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # The install and execute arms above set exec_cmd when they want a
+ # command run after the dispatch; the other arms shown here exit on
+ # their own. An empty exec_cmd at this point is reported as an invalid mode.
+ if test -z "$exec_cmd"; then
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+ fi
+fi # test -z "$show_help"
+
+# Replace this shell with the deferred command. The "exit 1" is only
+# reached if the exec did not take over the process.
+if test -n "$exec_cmd"; then
+ eval exec $exec_cmd
+ exit 1
+fi
+
+# We need to display help for each of the modes.
+case $mode in
+"") $echo \
+"Usage: $modename [OPTION]... [MODE-ARG]...
+
+Provide generalized library-building support services.
+
+ --config show all configuration variables
+ --debug enable verbose shell tracing
+-n, --dry-run display commands without modifying any files
+ --features display basic configuration information and exit
+ --finish same as \`--mode=finish'
+ --help display this help message and exit
+ --mode=MODE use operation mode MODE [default=inferred from MODE-ARGS]
+ --quiet same as \`--silent'
+ --silent don't print informational messages
+ --tag=TAG use configuration variables from tag TAG
+ --version print version information
+
+MODE must be one of the following:
+
+ clean remove files from the build directory
+ compile compile a source file into a libtool object
+ execute automatically set library path, then run a program
+ finish complete the installation of libtool libraries
+ install install libraries or executables
+ link create a library or an executable
+ uninstall remove libraries from an installed directory
+
+MODE-ARGS vary depending on the MODE. Try \`$modename --help --mode=MODE' for
+a more detailed description of MODE."
+ exit 0
+ ;;
+
+clean)
+ $echo \
+"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+# Help text for --mode=compile.
+compile)
+ $echo \
+"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
+
+Compile a source file into a libtool library object.
+
+This mode accepts the following additional options:
+
+ -o OUTPUT-FILE set the output file name to OUTPUT-FILE
+ -prefer-pic try to build PIC objects only
+ -prefer-non-pic try to build non-PIC objects only
+ -static always build a \`.o' file suitable for static linking
+
+COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+from the given SOURCEFILE.
+
+The output file name is determined by removing the directory component from
+SOURCEFILE, then substituting the C source code suffix \`.c' with the
+library object suffix, \`.lo'."
+ ;;
+
+execute)
+ $echo \
+"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
+
+Automatically set library path, then run a program.
+
+This mode accepts the following additional options:
+
+ -dlopen FILE add the directory containing FILE to the library path
+
+This mode sets the library path environment variable according to \`-dlopen'
+flags.
+
+If any of the ARGS are libtool executable wrappers, then they are translated
+into their corresponding uninstalled binary, and any of their required library
+directories are added to the library path.
+
+Then, COMMAND is executed, with ARGS as arguments."
+ ;;
+
+finish)
+ $echo \
+"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
+
+Complete the installation of libtool libraries.
+
+Each LIBDIR is a directory that contains libtool libraries.
+
+The commands that this mode executes may require superuser privileges. Use
+the \`--dry-run' option if you just want to see what would be executed."
+ ;;
+
+install)
+ $echo \
+"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
+
+Install executables or libraries.
+
+INSTALL-COMMAND is the installation command. The first component should be
+either the \`install' or \`cp' program.
+
+The rest of the components are interpreted as arguments to that command (only
+BSD-compatible install options are recognized)."
+ ;;
+
+# Help text for --mode=link.
+link)
+ $echo \
+"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
+
+Link object files or libraries together to form another library, or to
+create an executable program.
+
+LINK-COMMAND is a command using the C compiler that you would use to create
+a program from several object files.
+
+The following components of LINK-COMMAND are treated specially:
+
+ -all-static do not do any dynamic linking at all
+ -avoid-version do not add a version suffix if possible
+ -dlopen FILE \`-dlpreopen' FILE if it cannot be dlopened at runtime
+ -dlpreopen FILE link in FILE and add its symbols to lt_preloaded_symbols
+ -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+ -export-symbols SYMFILE
+ try to export only the symbols listed in SYMFILE
+ -export-symbols-regex REGEX
+ try to export only the symbols matching REGEX
+ -LLIBDIR search LIBDIR for required installed libraries
+ -lNAME OUTPUT-FILE requires the installed library libNAME
+ -module build a library that can be dlopened
+ -no-fast-install disable the fast-install mode
+ -no-install link a not-installable executable
+ -no-undefined declare that a library does not refer to external symbols
+ -o OUTPUT-FILE create OUTPUT-FILE from the specified objects
+ -objectlist FILE Use a list of object files found in FILE to specify objects
+ -release RELEASE specify package release information
+ -rpath LIBDIR the created library will eventually be installed in LIBDIR
+ -R[ ]LIBDIR add LIBDIR to the runtime path of programs and libraries
+ -static do not do any dynamic linking of libtool libraries
+ -version-info CURRENT[:REVISION[:AGE]]
+ specify library version info [each variable defaults to 0]
+
+All other options (arguments beginning with \`-') are ignored.
+
+Every other argument is treated as a filename. Files ending in \`.la' are
+treated as uninstalled libtool libraries, other files are standard or library
+object files.
+
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
+
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
+
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
+ ;;
+
+uninstall)
+ $echo \
+"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
+
+Remove libraries from an installation directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, all the files associated with it are deleted.
+Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+*)
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+esac
+
+echo
+$echo "Try \`$modename --help' for more information about other modes."
+
+exit 0
+
+# The TAGs below are defined such that we never get into a situation
+# in which we disable both kinds of libraries. Given conflicting
+# choices, we go for a static library, that is the most portable,
+# since we can't tell whether shared libraries were disabled because
+# the user asked for that or because the platform doesn't support
+# them. This is particularly important on AIX, because we don't
+# support having both static and shared libraries enabled at the same
+# time on that platform, so we default to a shared-only configuration.
+# If a disable-shared tag is given, we'll fall back to a static-only
+# configuration. But we'll never go from static-only to shared-only.
+
+### BEGIN LIBTOOL TAG CONFIG: disable-shared
+build_libtool_libs=no
+build_old_libs=yes
+### END LIBTOOL TAG CONFIG: disable-shared
+
+### BEGIN LIBTOOL TAG CONFIG: disable-static
+build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac`
+### END LIBTOOL TAG CONFIG: disable-static
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
ac_help=
ac_default_prefix=/usr/local
# Any additions from configure.in:
-ac_help="$ac_help
- --disable-dependency-tracking Speeds up one-time builds
- --enable-dependency-tracking Do not reject slow dependency extractors"
ac_default_prefix=/usr/local/gromacs
ac_help="$ac_help
- --enable-mpi Compile parallel version of Gromacs"
+ --enable-mpi Compile parallel version of GROMACS"
ac_help="$ac_help
--enable-vector Compile for a vector machine"
ac_help="$ac_help
- --enable-fortran Dortran loops (default on sgi,ibm,sun,tru64/dec)"
+ --enable-fortran Fortran loops (default on sgi,ibm,sun,axp)"
ac_help="$ac_help
- --enable-double Compile double precision Gromacs"
+ --enable-float Compile single precision GROMACS"
ac_help="$ac_help
--disable-type-suffix Don't add a suffix to double precision files"
ac_help="$ac_help
ac_help="$ac_help
--disable-nice Disable the nice priority on mdrun"
ac_help="$ac_help
- --disable-sse Disable SSE assembly loops on x86"
-ac_help="$ac_help
- --disable-3dnow Disable 3DNow assembly loops on x86"
+ --disable-x86-asm Disable assembly loops on x86"
ac_help="$ac_help
--disable-xdr Disable portable trajectory routines"
ac_help="$ac_help
--enable-hide-table-latency Try to get table data to cache before using it"
ac_help="$ac_help
--with-mpi-environment=VAR Only start MPI mdrun when VAR is set"
+ac_help="$ac_help
+ --enable-shared[=PKGS] build shared libraries [default=yes]"
+ac_help="$ac_help
+ --enable-static[=PKGS] build static libraries [default=yes]"
+ac_help="$ac_help
+ --enable-fast-install[=PKGS] optimize for fast installation [default=yes]"
+ac_help="$ac_help
+ --with-gnu-ld assume the C compiler uses GNU ld [default=no]"
+ac_help="$ac_help
+ --disable-libtool-lock avoid locking (might break parallel builds)"
+ac_help="$ac_help
+ --with-pic try to use only PIC/non-PIC [default=both]"
ac_help="$ac_help
--with-x use the X Window System"
ac_help="$ac_help
- --with-motif-includes=DIR Motif include files are in DIR"
+ --with-motif-includes=DIR Motif include files are in DIR"
ac_help="$ac_help
- --with-motif-libraries=DIR Motif libraries are in DIR"
+ --with-motif-libraries=DIR Motif libraries are in DIR"
# Initialize some variables set by options.
# The variables have the same names as the options, with
ac_n= ac_c='\c' ac_t=
fi
+echo $ac_n "checking for Cygwin environment""... $ac_c" 1>&6
+echo "configure:587: checking for Cygwin environment" >&5
+if eval "test \"`echo '$''{'ac_cv_cygwin'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 592 "configure"
+#include "confdefs.h"
+
+int main() {
+
+#ifndef __CYGWIN__
+#define __CYGWIN__ __CYGWIN32__
+#endif
+return __CYGWIN__;
+; return 0; }
+EOF
+if { (eval echo configure:603: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_cygwin=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_cygwin=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_cygwin" 1>&6
+CYGWIN=
+test "$ac_cv_cygwin" = yes && CYGWIN=yes
+echo $ac_n "checking for mingw32 environment""... $ac_c" 1>&6
+echo "configure:620: checking for mingw32 environment" >&5
+if eval "test \"`echo '$''{'ac_cv_mingw32'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 625 "configure"
+#include "confdefs.h"
+
+int main() {
+return __MINGW32__;
+; return 0; }
+EOF
+if { (eval echo configure:632: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_mingw32=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_mingw32=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_mingw32" 1>&6
+MINGW32=
+test "$ac_cv_mingw32" = yes && MINGW32=yes
ac_aux_dir=
-for ac_dir in ./config $srcdir/./config; do
+for ac_dir in config $srcdir/config; do
if test -f $ac_dir/install-sh; then
ac_aux_dir=$ac_dir
ac_install_sh="$ac_aux_dir/install-sh -c"
fi
done
if test -z "$ac_aux_dir"; then
- { echo "configure: error: can not find install-sh or install.sh in ./config $srcdir/./config" 1>&2; exit 1; }
+ { echo "configure: error: can not find install-sh or install.sh in config $srcdir/config" 1>&2; exit 1; }
fi
ac_config_guess=$ac_aux_dir/config.guess
ac_config_sub=$ac_aux_dir/config.sub
# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
# ./install, which can be erroneously created by make from ./install.sh.
echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:612: checking for a BSD compatible install" >&5
+echo "configure:681: checking for a BSD compatible install" >&5
if test -z "$INSTALL"; then
if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
echo $ac_n "checking whether build environment is sane""... $ac_c" 1>&6
-echo "configure:665: checking whether build environment is sane" >&5
+echo "configure:734: checking whether build environment is sane" >&5
# Just in case
sleep 1
-echo timestamp > conftest.file
+echo timestamp > conftestfile
# Do `set' in a subshell so we don't clobber the current shell's
# arguments. Must try -L first in case configure is actually a
# symlink; some systems play weird games with the mod time of symlinks
# (eg FreeBSD returns the mod time of the symlink's containing
# directory).
if (
- set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
if test "$*" = "X"; then
# -L didn't work.
- set X `ls -t $srcdir/configure conftest.file`
+ set X `ls -t $srcdir/configure conftestfile`
fi
- if test "$*" != "X $srcdir/configure conftest.file" \
- && test "$*" != "X conftest.file $srcdir/configure"; then
+ if test "$*" != "X $srcdir/configure conftestfile" \
+ && test "$*" != "X conftestfile $srcdir/configure"; then
# If neither matched, then we have a broken ls. This can happen
# if, for instance, CONFIG_SHELL is bash and it inherits a
alias in your environment" 1>&2; exit 1; }
fi
- test "$2" = conftest.file
+ test "$2" = conftestfile
)
then
# Ok.
# sed with no file args requires a program.
test "$program_transform_name" = "" && program_transform_name="s,x,x,"
-test x"${MISSING+set}" = xset ||
- MISSING="\${SHELL} `CDPATH=:; cd $ac_aux_dir && pwd`/missing"
-# Use eval to expand $SHELL
-if eval "$MISSING --run :"; then
- am_missing_run="$MISSING --run "
-else
- am_missing_run=
- am_backtick='`'
- echo "configure: warning: ${am_backtick}missing' script is too old or missing" 1>&2
-fi
-
-for ac_prog in mawk gawk nawk awk
-do
-# Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:737: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_AWK'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- if test -n "$AWK"; then
- ac_cv_prog_AWK="$AWK" # Let the user override the test.
-else
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
- ac_dummy="$PATH"
- for ac_dir in $ac_dummy; do
- test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/$ac_word; then
- ac_cv_prog_AWK="$ac_prog"
- break
- fi
- done
- IFS="$ac_save_ifs"
-fi
-fi
-AWK="$ac_cv_prog_AWK"
-if test -n "$AWK"; then
- echo "$ac_t""$AWK" 1>&6
-else
- echo "$ac_t""no" 1>&6
-fi
-
-test -n "$AWK" && break
-done
-
echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6
-echo "configure:767: checking whether ${MAKE-make} sets \${MAKE}" >&5
+echo "configure:791: checking whether ${MAKE-make} sets \${MAKE}" >&5
set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
SET_MAKE="MAKE=${MAKE-make}"
fi
-# Check whether --enable-dependency-tracking or --disable-dependency-tracking was given.
-if test "${enable_dependency_tracking+set}" = set; then
- enableval="$enable_dependency_tracking"
- :
-fi
-
-if test "x$enable_dependency_tracking" = xno; then
- AMDEP="#"
-else
- am_depcomp="$ac_aux_dir/depcomp"
- if test ! -f "$am_depcomp"; then
- AMDEP="#"
- else
- AMDEP=
- fi
-fi
-
-if test -z "$AMDEP"; then
- AMDEPBACKSLASH='\'
-else
- AMDEPBACKSLASH=
-fi
-
+PACKAGE=gromacs
+VERSION=3.0
-if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
- DEPDIR=.deps
- # We redirect because .deps might already exist and be populated.
- # In this situation we don't want to see an error.
- rmdir .deps > /dev/null 2>&1
-else
- DEPDIR=_deps
-fi
-
-
-# test to see if srcdir already configured
-if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" &&
- test -f $srcdir/config.status; then
- { echo "configure: error: source directory already configured; run \"make distclean\" there first" 1>&2; exit 1; }
+if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then
+ { echo "configure: error: source directory already configured; run "make distclean" there first" 1>&2; exit 1; }
fi
-
-# Define the identity of the package.
-PACKAGE=gromacs
-VERSION=3.0
cat >> confdefs.h <<EOF
#define PACKAGE "$PACKAGE"
EOF
EOF
-# Autoconf 2.50 wants to disallow AM_ names. We explicitly allow
-# the ones we care about.
-
-
-
+missing_dir=`cd $ac_aux_dir && pwd`
+echo $ac_n "checking for working aclocal""... $ac_c" 1>&6
+echo "configure:837: checking for working aclocal" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if (aclocal --version) < /dev/null > /dev/null 2>&1; then
+ ACLOCAL=aclocal
+ echo "$ac_t""found" 1>&6
+else
+ ACLOCAL="$missing_dir/missing aclocal"
+ echo "$ac_t""missing" 1>&6
+fi
+echo $ac_n "checking for working autoconf""... $ac_c" 1>&6
+echo "configure:850: checking for working autoconf" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if (autoconf --version) < /dev/null > /dev/null 2>&1; then
+ AUTOCONF=autoconf
+ echo "$ac_t""found" 1>&6
+else
+ AUTOCONF="$missing_dir/missing autoconf"
+ echo "$ac_t""missing" 1>&6
+fi
+echo $ac_n "checking for working automake""... $ac_c" 1>&6
+echo "configure:863: checking for working automake" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if (automake --version) < /dev/null > /dev/null 2>&1; then
+ AUTOMAKE=automake
+ echo "$ac_t""found" 1>&6
+else
+ AUTOMAKE="$missing_dir/missing automake"
+ echo "$ac_t""missing" 1>&6
+fi
+echo $ac_n "checking for working autoheader""... $ac_c" 1>&6
+echo "configure:876: checking for working autoheader" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if (autoheader --version) < /dev/null > /dev/null 2>&1; then
+ AUTOHEADER=autoheader
+ echo "$ac_t""found" 1>&6
+else
+ AUTOHEADER="$missing_dir/missing autoheader"
+ echo "$ac_t""missing" 1>&6
+fi
-# Some tools Automake needs.
-
-ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal"}
-
-
-AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
-
-
-AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake"}
-
-
-AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
-
-
-MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
-
-
-AMTAR=${AMTAR-"${am_missing_run}tar"}
-
-
-if test -z "$install_sh"; then
- for install_sh in "$ac_aux_dir/install-sh" \
- "$ac_aux_dir/install.sh" \
- "${am_missing_run}${ac_auxdir}/install-sh";
- do
- test -f "$install_sh" && break
- done
- # FIXME: an evil hack: we remove the SHELL invocation from
- # install_sh because automake adds it back in. Sigh.
- install_sh=`echo $install_sh | sed -e 's/\${SHELL}//'`
+echo $ac_n "checking for working makeinfo""... $ac_c" 1>&6
+echo "configure:889: checking for working makeinfo" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf. Sigh.
+if (makeinfo --version) < /dev/null > /dev/null 2>&1; then
+ MAKEINFO=makeinfo
+ echo "$ac_t""found" 1>&6
+else
+ MAKEINFO="$missing_dir/missing makeinfo"
+ echo "$ac_t""missing" 1>&6
fi
-# We need awk for the "check" target. The system "awk" is bad on
-# some platforms.
+
+SHARED_VERSION_INFO="1:0:0"
-
#######################################################################
#####
-# Check whether --enable-double or --disable-double was given.
-if test "${enable_double+set}" = set; then
- enableval="$enable_double"
- enable_double=$enableval
+# Check whether --enable-float or --disable-float was given.
+if test "${enable_float+set}" = set; then
+ enableval="$enable_float"
+ enable_float=$enableval
else
- enable_double=no
+ enable_float=yes
fi
fi
-
#####
# Check whether --enable-simplewater or --disable-simplewater was given.
#####
-# Check whether --enable-sse or --disable-sse was given.
-if test "${enable_sse+set}" = set; then
- enableval="$enable_sse"
- enable_sse=$enableval
-else
- enable_sse=yes
-fi
-
-
-
-#####
-
-# Check whether --enable-3dnow or --disable-3dnow was given.
-if test "${enable_3dnow+set}" = set; then
- enableval="$enable_3dnow"
- enable_3dnow=$enableval
+# Check whether --enable-x86_asm or --disable-x86_asm was given.
+if test "${enable_x86_asm+set}" = set; then
+ enableval="$enable_x86_asm"
+ enable_x86_asm=$enableval
else
- enable_3dnow=yes
+ enable_x86_asm=yes
fi
fi
echo $ac_n "checking host system type""... $ac_c" 1>&6
-echo "configure:1171: checking host system type" >&5
+echo "configure:1169: checking host system type" >&5
host_alias=$host
case "$host_alias" in
if test "$enable_fortran" = "check"; then
case "${host_cpu}-${host_os}" in
- sparc*-solaris* | alpha*-osf* | rs6000*-aix* | mips*-irix*)
+ sparc*-solaris* | alpha*-* | rs6000*-aix* | mips*-irix*)
enable_fortran=yes
;;
esac
if test "$enable_fortran" = "yes"; then
- # vendor f77 before g77
- for ac_prog in f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90
+ # vendor f77 before g77 - but special compiler list for alpha-linux
+ case "${host_cpu}-${host_os}" in
+ alpha*-linux*)
+ for ac_prog in fort f77 g77
do
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
test -n "$F77" && break
done
+ ;;
+ *)
+ for ac_prog in f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:1330: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_F77'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test -n "$F77"; then
+ ac_cv_prog_F77="$F77" # Let the user override the test.
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ ac_cv_prog_F77="$ac_prog"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+fi
+fi
+F77="$ac_cv_prog_F77"
+if test -n "$F77"; then
+ echo "$ac_t""$F77" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+test -n "$F77" && break
+done
+
+ ;;
+ esac
if test -z "$F77"; then
for ac_prog in g77 f77 f2c
do
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1329: checking for $ac_word" >&5
+echo "configure:1367: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_F77'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
echo $ac_n "checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:1362: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works" >&5
+echo "configure:1400: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works" >&5
ac_ext=f
ac_compile='${F77-f77} -c $FFLAGS conftest.$ac_ext 1>&5'
end
EOF
-if { (eval echo configure:1375: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:1413: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
ac_cv_prog_f77_works=yes
# If we can't run a trivial program, we are probably using a cross compiler.
if (./conftest; exit) 2>/dev/null; then
{ echo "configure: error: installation or configuration problem: Fortran 77 compiler cannot create executables." 1>&2; exit 1; }
fi
echo $ac_n "checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:1401: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:1439: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler" >&5
echo "$ac_t""$ac_cv_prog_f77_cross" 1>&6
cross_compiling=$ac_cv_prog_f77_cross
echo $ac_n "checking whether we are using GNU Fortran 77""... $ac_c" 1>&6
-echo "configure:1406: checking whether we are using GNU Fortran 77" >&5
+echo "configure:1444: checking whether we are using GNU Fortran 77" >&5
if eval "test \"`echo '$''{'ac_cv_prog_g77'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
yes
#endif
EOF
-if { ac_try='$F77 -E conftest.fpp'; { (eval echo configure:1415: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='$F77 -E conftest.fpp'; { (eval echo configure:1453: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
ac_cv_prog_g77=yes
else
ac_cv_prog_g77=no
ac_save_FFLAGS="$FFLAGS"
FFLAGS=
echo $ac_n "checking whether $F77 accepts -g""... $ac_c" 1>&6
-echo "configure:1430: checking whether $F77 accepts -g" >&5
+echo "configure:1468: checking whether $F77 accepts -g" >&5
if eval "test \"`echo '$''{'ac_cv_prog_f77_g'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
fi
-# Checks for programs.
-echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6
-echo "configure:1467: checking whether ${MAKE-make} sets \${MAKE}" >&5
-set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
-if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then
+case "${host_cpu}-${host_os}" in
+ alpha*-linux*)
+ for ac_prog in ccc cc
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:1510: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
- cat > conftestmake <<\EOF
-all:
- @echo 'ac_maketemp="${MAKE}"'
-EOF
-# GNU make sometimes prints "make[1]: Entering...", which would confuse us.
-eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=`
-if test -n "$ac_maketemp"; then
- eval ac_cv_prog_make_${ac_make}_set=yes
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
else
- eval ac_cv_prog_make_${ac_make}_set=no
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ ac_cv_prog_CC="$ac_prog"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
fi
-rm -f conftestmake
fi
-if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then
- echo "$ac_t""yes" 1>&6
- SET_MAKE=
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+ echo "$ac_t""$CC" 1>&6
else
echo "$ac_t""no" 1>&6
- SET_MAKE="MAKE=${MAKE-make}"
fi
-
-# Extract the first word of "cc", so it can be a program name with args.
+test -n "$CC" && break
+done
+ # do vendor cc before gcc
+ ;;
+ *)
+ # Extract the first word of "cc", so it can be a program name with args.
set dummy cc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1497: checking for $ac_word" >&5
+echo "configure:1544: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo "$ac_t""no" 1>&6
fi
# do vendor cc before gcc
+ ;;
+esac
# Extract the first word of "gcc", so it can be a program name with args.
set dummy gcc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1526: checking for $ac_word" >&5
+echo "configure:1575: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# Extract the first word of "cc", so it can be a program name with args.
set dummy cc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1556: checking for $ac_word" >&5
+echo "configure:1605: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# Extract the first word of "cl", so it can be a program name with args.
set dummy cl; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1607: checking for $ac_word" >&5
+echo "configure:1656: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:1639: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
+echo "configure:1688: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
ac_ext=c
# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
cat > conftest.$ac_ext << EOF
-#line 1650 "configure"
+#line 1699 "configure"
#include "confdefs.h"
main(){return(0);}
EOF
-if { (eval echo configure:1655: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:1704: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
ac_cv_prog_cc_works=yes
# If we can't run a trivial program, we are probably using a cross compiler.
if (./conftest; exit) 2>/dev/null; then
{ echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:1681: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:1730: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
cross_compiling=$ac_cv_prog_cc_cross
echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:1686: checking whether we are using GNU C" >&5
+echo "configure:1735: checking whether we are using GNU C" >&5
if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
yes;
#endif
EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1695: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1744: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
ac_cv_prog_gcc=yes
else
ac_cv_prog_gcc=no
ac_save_CFLAGS="$CFLAGS"
CFLAGS=
echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6
-echo "configure:1714: checking whether ${CC-cc} accepts -g" >&5
+echo "configure:1763: checking whether ${CC-cc} accepts -g" >&5
if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
fi
-
echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:1747: checking how to run the C preprocessor" >&5
+echo "configure:1795: checking how to run the C preprocessor" >&5
# On Suns, sometimes $CPP names a directory.
if test -n "$CPP" && test -d "$CPP"; then
CPP=
# On the NeXT, cc -E runs the code through the compiler's parser,
# not just through cpp.
cat > conftest.$ac_ext <<EOF
-#line 1762 "configure"
+#line 1810 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1768: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1816: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
rm -rf conftest*
CPP="${CC-cc} -E -traditional-cpp"
cat > conftest.$ac_ext <<EOF
-#line 1779 "configure"
+#line 1827 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1785: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1833: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
rm -rf conftest*
CPP="${CC-cc} -nologo -E"
cat > conftest.$ac_ext <<EOF
-#line 1796 "configure"
+#line 1844 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1802: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1850: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
fi
echo "$ac_t""$CPP" 1>&6
-am_make=${MAKE-make}
-# BSD make uses .include
-cat > confinc << 'END'
-doit:
- @echo done
-END
-# If we don't find an include directive, just comment out the code.
-echo $ac_n "checking for style of include used by $am_make""... $ac_c" 1>&6
-echo "configure:1834: checking for style of include used by $am_make" >&5
-_am_include='#'
-for am_inc in include .include; do
- echo "$am_inc confinc" > confmf
- if test "`$am_make -f confmf 2> /dev/null`" = "done"; then
- _am_include=$am_inc
- break
- fi
-done
-
-echo "$ac_t""$_am_include" 1>&6
-rm -f confinc confmf
-
-
-depcc="$CC"
-depcpp="$CPP"
-
-
-
-echo $ac_n "checking dependency style of $depcc""... $ac_c" 1>&6
-echo "configure:1854: checking dependency style of $depcc" >&5
-if eval "test \"`echo '$''{'am_cv_CC_dependencies_compiler_type'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- if test -z "$AMDEP"; then
- # We make a subdir and do the tests there. Otherwise we can end up
- # making bogus files that we don't know about and never remove. For
- # instance it was reported that on HP-UX the gcc test will end up
- # making a dummy file named `D' -- because `-MD' means `put the output
- # in D'.
- mkdir confdir
- # Copy depcomp to subdir because otherwise we won't find it if we're
- # using a relative directory.
- cp "$am_depcomp" confdir
- cd confdir
-
- am_cv_CC_dependencies_compiler_type=none
- for depmode in `sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < "./depcomp"`; do
- # We need to recreate these files for each test, as the compiler may
- # overwrite some of them when testing with obscure command lines.
- # This happens at least with the AIX C compiler.
- echo '#include "conftest.h"' > conftest.c
- echo 'int i;' > conftest.h
-
- case "$depmode" in
- nosideeffect)
- # after this tag, mechanisms are not by side-effect, so they'll
- # only be used when explicitly requested
- if test "x$enable_dependency_tracking" = xyes; then
- continue
- else
- break
- fi
- ;;
- none) break ;;
- esac
- # We check with `-c' and `-o' for the sake of the "dashmstdout"
- # mode. It turns out that the SunPro C++ compiler does not properly
- # handle `-M -o', and we need to detect this.
- if depmode="$depmode" \
- source=conftest.c object=conftest.o \
- depfile=conftest.Po tmpdepfile=conftest.TPo \
- $SHELL ./depcomp $depcc -c conftest.c -o conftest.o >/dev/null 2>&1 &&
- grep conftest.h conftest.Po > /dev/null 2>&1; then
- am_cv_CC_dependencies_compiler_type="$depmode"
- break
- fi
- done
-
- cd ..
- rm -rf confdir
-else
- am_cv_CC_dependencies_compiler_type=none
-fi
-
-fi
-
-echo "$ac_t""$am_cv_CC_dependencies_compiler_type" 1>&6
-CCDEPMODE="depmode=$am_cv_CC_dependencies_compiler_type"
-
-
BUILD_CC=$CC
if test "$enable_fortran" = "yes"; then
echo $ac_n "checking for Fortran 77 libraries""... $ac_c" 1>&6
-echo "configure:1920: checking for Fortran 77 libraries" >&5
+echo "configure:1879: checking for Fortran 77 libraries" >&5
if eval "test \"`echo '$''{'ac_cv_flibs'+set}'`\" = set"; then
echo $ac_n "checking fortran name mangling""... $ac_c" 1>&6
-echo "configure:2079: checking fortran name mangling" >&5
+echo "configure:2038: checking fortran name mangling" >&5
cat > mangle-func.f <<EOF
subroutine foobar()
return
end
EOF
ac_try='$F77 -c $FFLAGS mangle-func.f 1>&5'
-if { (eval echo configure:2089: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; then
+if { (eval echo configure:2048: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; then
ac_try=""
else
echo "configure: failed program was:" >&5
ac_save_LIBS="$LIBS"
LIBS="mangle-func.o $FLIBS $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2110 "configure"
+#line 2069 "configure"
#include "confdefs.h"
int main() {
foobar();
; return 0; }
EOF
-if { (eval echo configure:2117: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2076: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_f77_mangle_type=lowercase
else
cat conftest.$ac_ext >&5
rm -rf conftest*
cat > conftest.$ac_ext <<EOF
-#line 2125 "configure"
+#line 2084 "configure"
#include "confdefs.h"
int main() {
foobar_();
; return 0; }
EOF
-if { (eval echo configure:2132: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2091: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_f77_mangle_type=lowercase-underscore
else
cat conftest.$ac_ext >&5
rm -rf conftest*
cat > conftest.$ac_ext <<EOF
-#line 2140 "configure"
+#line 2099 "configure"
#include "confdefs.h"
int main() {
FOOBAR();
; return 0; }
EOF
-if { (eval echo configure:2147: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2106: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_f77_mangle_type=uppercase
else
cat conftest.$ac_ext >&5
rm -rf conftest*
cat > conftest.$ac_ext <<EOF
-#line 2155 "configure"
+#line 2114 "configure"
#include "confdefs.h"
int main() {
FOOBAR_();
; return 0; }
EOF
-if { (eval echo configure:2162: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2121: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_f77_mangle_type=uppercase-underscore
else
esac
echo $ac_n "checking whether f77 functions with underscore get an extra underscore""... $ac_c" 1>&6
-echo "configure:2219: checking whether f77 functions with underscore get an extra underscore" >&5
+echo "configure:2178: checking whether f77 functions with underscore get an extra underscore" >&5
ac_ext=c
ac_save_LIBS="$LIBS"
LIBS="mangle-func.o $FLIBS $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2232 "configure"
+#line 2191 "configure"
#include "confdefs.h"
int main() {
$mangle_try();
; return 0; }
EOF
-if { (eval echo configure:2239: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2198: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_f77_mangle_underscore=yes;
cat >> confdefs.h <<\EOF
fi
# if we are using mpi, also get an MPICC. We cannot set that in the PROG_CC macro
-# above, since the autoconf checks that the created file can be executed. This would
+# above, since autoconf checks that the created file can be executed. This would
# fail on platforms where MPI executables can only be run through a batchqueue.
if test "$enable_mpi" = "yes"; then
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2276: checking for $ac_word" >&5
+echo "configure:2235: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_MPICC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# now change the normal cc to the MPI one - see the comment above.
CC=$MPICC
echo $ac_n "checking whether the MPI cc command works""... $ac_c" 1>&6
-echo "configure:2309: checking whether the MPI cc command works" >&5 # be paranoid
+echo "configure:2268: checking whether the MPI cc command works" >&5 # be paranoid
cat > conftest.$ac_ext <<EOF
-#line 2311 "configure"
+#line 2270 "configure"
#include "confdefs.h"
#include <mpi.h>
int main() {
int argc; char **argv; MPI_Init(&argc,&argv);
; return 0; }
EOF
-if { (eval echo configure:2318: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2277: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
else
fi
fi
-# Find a good install program. We prefer a C program (faster),
-# so one script is as good as another. But avoid the broken or
-# incompatible versions:
-# SysV /etc/install, /usr/sbin/install
-# SunOS /usr/etc/install
-# IRIX /sbin/install
-# AIX /bin/install
-# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
-# AFS /usr/afsws/bin/install, which mishandles nonexistent args
-# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
-# ./install, which can be erroneously created by make from ./install.sh.
-echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:2361: checking for a BSD compatible install" >&5
-if test -z "$INSTALL"; then
-if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":"
- for ac_dir in $PATH; do
- # Account for people who put trailing slashes in PATH elements.
- case "$ac_dir/" in
- /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
- *)
- # OSF1 and SCO ODT 3.0 have their own names for install.
- # Don't use installbsd from OSF since it installs stuff as root
- # by default.
- for ac_prog in ginstall scoinst install; do
- if test -f $ac_dir/$ac_prog; then
- if test $ac_prog = install &&
- grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
- # AIX install. It has an incompatible calling convention.
- :
- else
- ac_cv_path_install="$ac_dir/$ac_prog -c"
- break 2
- fi
- fi
- done
- ;;
- esac
- done
- IFS="$ac_save_IFS"
-
-fi
- if test "${ac_cv_path_install+set}" = set; then
- INSTALL="$ac_cv_path_install"
- else
- # As a last resort, use the slow shell script. We don't cache a
- # path for INSTALL within a source directory, because that will
- # break other packages using the cache if that directory is
- # removed, or if the path is relative.
- INSTALL="$ac_install_sh"
+# A rather complicated check for the capabilities of as, to make
+# sure we can compile the assembly innerloops.
+if test "$x86" = "yes"; then
+ if test "$enable_x86_asm" = "yes"; then
+ if test "$enable_float" = "no"; then
+ echo "configure: warning: The assembly loops can only be used in single precision - disabling" 1>&2
+ enable_x86_asm=no
+ else
+ echo $ac_n "checking whether as fully supports intel syntax""... $ac_c" 1>&6
+echo "configure:2317: checking whether as fully supports intel syntax" >&5
+cat > conftest.s << EOF
+.intel_syntax noprefix
+checkasm:
+ emms
+ pswapd mm0,mm0
+ movups xmm0,checkasm
+ emms
+ ret
+EOF
+ if { ac_try='$CC -c conftest.s'; { (eval echo configure:2327: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
+ echo "$ac_t""yes" 1>&6
+ else
+ echo "$ac_t""no" 1>&6
+ { echo "configure: error: Upgrade to binutils>=2.11, download the as executable
+ from www.gromacs.org, or disable assembly loops." 1>&2; exit 1; }
+ fi
+ fi
fi
+else # not x86
+enable_x86_asm=no
fi
-echo "$ac_t""$INSTALL" 1>&6
-
-# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
-# It thinks the first close brace ends the variable substitution.
-test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
-
-test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}'
-test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
-
-echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:2414: checking how to run the C preprocessor" >&5
-# On Suns, sometimes $CPP names a directory.
-if test -n "$CPP" && test -d "$CPP"; then
- CPP=
-fi
-if test -z "$CPP"; then
-if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then
+# Extract the first word of "ident", so it can be a program name with args.
+set dummy ident; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2343: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_path_IDENT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
- # This must be in double quotes, not single quotes, because CPP may get
- # substituted into the Makefile and "${CC-cc}" will confuse make.
- CPP="${CC-cc} -E"
- # On the NeXT, cc -E runs the code through the compiler's parser,
- # not just through cpp.
- cat > conftest.$ac_ext <<EOF
-#line 2429 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2435: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
- :
+ case "$IDENT" in
+ /*)
+ ac_cv_path_IDENT="$IDENT" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_IDENT="$IDENT" # Let the user override the test with a dos path.
+ ;;
+ *)
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ ac_cv_path_IDENT="$ac_dir/$ac_word"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_path_IDENT" && ac_cv_path_IDENT="no"
+ ;;
+esac
+fi
+IDENT="$ac_cv_path_IDENT"
+if test -n "$IDENT"; then
+ echo "$ac_t""$IDENT" 1>&6
else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- CPP="${CC-cc} -E -traditional-cpp"
+ echo "$ac_t""no" 1>&6
+fi
+
+if test "$IDENT" != "no"; then
+ # seems as if we have the ident program, but does the
+ # compiler support it?
+ echo $ac_n "checking whether the compiler supports ident""... $ac_c" 1>&6
+echo "configure:2380: checking whether the compiler supports ident" >&5
cat > conftest.$ac_ext <<EOF
-#line 2446 "configure"
+#line 2382 "configure"
#include "confdefs.h"
-#include <assert.h>
-Syntax Error
+#ident "@(#) file.h 1.1 12/16/92"
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2452: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:2387: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
- :
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
rm -rf conftest*
- CPP="${CC-cc} -nologo -E"
- cat > conftest.$ac_ext <<EOF
-#line 2463 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
+
+ echo "$ac_t""yes" 1>&6
+ cat >> confdefs.h <<\EOF
+#define HAVE_IDENT
EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2469: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
- :
+
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
- CPP=/lib/cpp
+ echo "$ac_t""no" 1>&6
fi
rm -f conftest*
fi
-rm -f conftest*
+
+
+echo $ac_n "checking whether ln -s works""... $ac_c" 1>&6
+echo "configure:2409: checking whether ln -s works" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_LN_S'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ rm -f conftestdata
+if ln -s X conftestdata 2>/dev/null
+then
+ rm -f conftestdata
+ ac_cv_prog_LN_S="ln -s"
+else
+ ac_cv_prog_LN_S=ln
+fi
+fi
+LN_S="$ac_cv_prog_LN_S"
+if test "$ac_cv_prog_LN_S" = "ln -s"; then
+ echo "$ac_t""yes" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+# Check whether --enable-shared or --disable-shared was given.
+if test "${enable_shared+set}" = set; then
+ enableval="$enable_shared"
+ p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_shared=yes
+fi
+
+# Check whether --enable-static or --disable-static was given.
+if test "${enable_static+set}" = set; then
+ enableval="$enable_static"
+ p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_static=yes
+fi
+
+# Check whether --enable-fast-install or --disable-fast-install was given.
+if test "${enable_fast_install+set}" = set; then
+ enableval="$enable_fast_install"
+ p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_fast_install=yes
+fi
+
+echo $ac_n "checking build system type""... $ac_c" 1>&6
+echo "configure:2499: checking build system type" >&5
+
+build_alias=$build
+case "$build_alias" in
+NONE)
+ case $nonopt in
+ NONE) build_alias=$host_alias ;;
+ *) build_alias=$nonopt ;;
+ esac ;;
+esac
+
+build=`${CONFIG_SHELL-/bin/sh} $ac_config_sub $build_alias`
+build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+echo "$ac_t""$build" 1>&6
+
+# Check whether --with-gnu-ld or --without-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then
+ withval="$with_gnu_ld"
+ test "$withval" = no || with_gnu_ld=yes
+else
+ with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$GCC" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ echo $ac_n "checking for ld used by GCC""... $ac_c" 1>&6
+echo "configure:2528: checking for ld used by GCC" >&5
+ case $host in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case $ac_prog in
+ # Accept absolute paths.
+ [\\/]* | [A-Za-z]:[\\/]*)
+ re_direlt='/[^/][^/]*/\.\./'
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ echo $ac_n "checking for GNU ld""... $ac_c" 1>&6
+echo "configure:2558: checking for GNU ld" >&5
+else
+ echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
+echo "configure:2561: checking for non-GNU ld" >&5
+fi
+if eval "test \"`echo '$''{'lt_cv_path_LD'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ lt_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+ echo "$ac_t""$LD" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+test -z "$LD" && { echo "configure: error: no acceptable ld found in \$PATH" 1>&2; exit 1; }
+echo $ac_n "checking if the linker ($LD) is GNU ld""... $ac_c" 1>&6
+echo "configure:2596: checking if the linker ($LD) is GNU ld" >&5
+if eval "test \"`echo '$''{'lt_cv_prog_gnu_ld'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ # I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ lt_cv_prog_gnu_ld=yes
+else
+ lt_cv_prog_gnu_ld=no
+fi
+fi
+
+echo "$ac_t""$lt_cv_prog_gnu_ld" 1>&6
+with_gnu_ld=$lt_cv_prog_gnu_ld
+
+
+echo $ac_n "checking for $LD option to reload object files""... $ac_c" 1>&6
+echo "configure:2613: checking for $LD option to reload object files" >&5
+if eval "test \"`echo '$''{'lt_cv_ld_reload_flag'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ lt_cv_ld_reload_flag='-r'
+fi
+
+echo "$ac_t""$lt_cv_ld_reload_flag" 1>&6
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+
+echo $ac_n "checking for BSD-compatible nm""... $ac_c" 1>&6
+echo "configure:2625: checking for BSD-compatible nm" >&5
+if eval "test \"`echo '$''{'lt_cv_path_NM'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test -n "$NM"; then
+ # Let the user override the test.
+ lt_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ tmp_nm=$ac_dir/${ac_tool_prefix}nm
+ if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ # Tru64's nm complains that /dev/null is an invalid object file
+ if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -B"
+ break
+ elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ lt_cv_path_NM="$tmp_nm -p"
+ break
+ else
+ lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi
+fi
+
+NM="$lt_cv_path_NM"
+echo "$ac_t""$NM" 1>&6
+
+echo $ac_n "checking how to recognise dependant libraries""... $ac_c" 1>&6
+echo "configure:2663: checking how to recognise dependant libraries" >&5
+if eval "test \"`echo '$''{'lt_cv_deplibs_check_method'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw* |pw32*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='$OBJDUMP -f'
+ ;;
+
+darwin* | rhapsody*)
+ lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+ lt_cv_file_magic_cmd='/usr/bin/file -L'
+ case "$host_os" in
+ rhapsody* | darwin1.012)
+ lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+ ;;
+ *) # Darwin 1.3 on
+ lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+ ;;
+ esac
+ ;;
+
+freebsd* )
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ case $host_cpu in
+ i*86 )
+ # Not sure whether the presence of OpenBSD here was a mistake.
+ # Let's accept both of them until this is cleared up.
+ lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library'
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+ ;;
+ esac
+ else
+ lt_cv_deplibs_check_method=pass_all
+ fi
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+hpux10.20*|hpux11*)
+ lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library'
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libc.sl
+ ;;
+
+irix5* | irix6*)
+ case $host_os in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case $LD in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case $host_cpu in
+ alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$'
+ else
+ lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$'
+ fi
+ ;;
+
+newsos6)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)'
+ lt_cv_file_magic_cmd=/usr/bin/file
+ lt_cv_file_magic_test_file=/usr/lib/libnls.so
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+sysv5uw[78]* | sysv4*uw2*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case $host_vendor in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+
+fi
+
+echo "$ac_t""$lt_cv_deplibs_check_method" 1>&6
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+
+echo $ac_n "checking for object suffix""... $ac_c" 1>&6
+echo "configure:2827: checking for object suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_objext'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ rm -f conftest*
+echo 'int i = 1;' > conftest.$ac_ext
+if { (eval echo configure:2833: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ for ac_file in conftest.*; do
+ case $ac_file in
+ *.c) ;;
+ *) ac_cv_objext=`echo $ac_file | sed -e s/conftest.//` ;;
+ esac
+ done
+else
+ { echo "configure: error: installation or configuration problem; compiler does not work" 1>&2; exit 1; }
fi
rm -f conftest*
- ac_cv_prog_CPP="$CPP"
fi
- CPP="$ac_cv_prog_CPP"
+
+echo "$ac_t""$ac_cv_objext" 1>&6
+OBJEXT=$ac_cv_objext
+ac_objext=$ac_cv_objext
+
+
+
+echo $ac_n "checking for executable suffix""... $ac_c" 1>&6
+echo "configure:2853: checking for executable suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_exeext'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
else
- ac_cv_prog_CPP="$CPP"
+ if test "$CYGWIN" = yes || test "$MINGW32" = yes; then
+ ac_cv_exeext=.exe
+else
+ rm -f conftest*
+ echo 'int main () { return 0; }' > conftest.$ac_ext
+ ac_cv_exeext=
+ if { (eval echo configure:2863: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; }; then
+ for file in conftest.*; do
+ case $file in
+ *.c | *.o | *.obj) ;;
+ *) ac_cv_exeext=`echo $file | sed -e s/conftest//` ;;
+ esac
+ done
+ else
+ { echo "configure: error: installation or configuration problem: compiler cannot create executables." 1>&2; exit 1; }
+ fi
+ rm -f conftest*
+ test x"${ac_cv_exeext}" = x && ac_cv_exeext=no
+fi
fi
-echo "$ac_t""$CPP" 1>&6
+EXEEXT=""
+test x"${ac_cv_exeext}" != xno && EXEEXT=${ac_cv_exeext}
+echo "$ac_t""${ac_cv_exeext}" 1>&6
+ac_exeext=$EXEEXT
-# A rather complicated check for the nasm program and x86 assembly capabilities
-# to run under windows we must insert a test a change nasm "-f elf" to "-f win32"
-if test "$x86" = "yes"; then
- if test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"; then
- if test "$enable_double" = "yes"; then
- echo "configure: warning: SSE/3Dnow assembly can only be used in single precision" 1>&2
- enable_sse=no
- enable_3dnow=no
- else
- # Extract the first word of "nasm", so it can be a program name with args.
-set dummy nasm; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2506: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_path_NASM'+set}'`\" = set"; then
+if test $host != $build; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only works for C compilers!
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+ if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+ echo $ac_n "checking for ${ac_tool_prefix}file""... $ac_c" 1>&6
+echo "configure:2896: checking for ${ac_tool_prefix}file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
- case "$NASM" in
+ case $MAGIC_CMD in
/*)
- ac_cv_path_NASM="$NASM" # Let the user override the test with a path.
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
;;
- ?:/*)
- ac_cv_path_NASM="$NASM" # Let the user override the test with a dos path.
+ ?:/*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
;;
*)
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
- ac_dummy="$PATH"
- for ac_dir in $ac_dummy; do
+ ac_save_MAGIC_CMD="$MAGIC_CMD"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/$ac_word; then
- ac_cv_path_NASM="$ac_dir/$ac_word"
+ if test -f $ac_dir/${ac_tool_prefix}file; then
+ lt_cv_path_MAGIC_CMD="$ac_dir/${ac_tool_prefix}file"
+ if test -n "$file_magic_test_file"; then
+ case $deplibs_check_method in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
break
fi
done
IFS="$ac_save_ifs"
- test -z "$ac_cv_path_NASM" && ac_cv_path_NASM="no"
+ MAGIC_CMD="$ac_save_MAGIC_CMD"
;;
esac
fi
-NASM="$ac_cv_path_NASM"
-if test -n "$NASM"; then
- echo "$ac_t""$NASM" 1>&6
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+ echo "$ac_t""$MAGIC_CMD" 1>&6
else
echo "$ac_t""no" 1>&6
fi
- NASMFLAGS="-f elf"
-
- if test "$NASM" = "no"; then
- { echo "configure: error: Nasm is required for SSE and 3DNow loops." 1>&2; exit 1; }
- fi
- if test "$enable_sse" = "yes"; then
- echo $ac_n "checking whether nasm supports SSE instructions""... $ac_c" 1>&6
-echo "configure:2546: checking whether nasm supports SSE instructions" >&5
-cat > conftest_sse.s << EOF
- global checksse
-checksse:
- emms
- xorps xmm0,xmm0
- emms
- ret
-EOF
- if { ac_try='$NASM conftest_sse.s'; { (eval echo configure:2555: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
- echo "$ac_t""yes" 1>&6
- else
- echo "$ac_t""no" 1>&6
- { echo "configure: error: Download a patched nasm from the Gromacs homepage,
- or disable SSE assembly." 1>&2; exit 1; }
- fi
- fi
- if test "$enable_3dnow" = "yes"; then
- echo $ac_n "checking whether nasm supports extended 3DNow instructions""... $ac_c" 1>&6
-echo "configure:2565: checking whether nasm supports extended 3DNow instructions" >&5
-cat > conftest_3dnow.s << EOF
- global check3dnow
-check3dnow:
- femms
- pswapd mm0,mm0
- femms
- ret
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+ if test -n "$ac_tool_prefix"; then
+ echo $ac_n "checking for file""... $ac_c" 1>&6
+echo "configure:2958: checking for file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ case $MAGIC_CMD in
+ /*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC_CMD="$MAGIC_CMD"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/file; then
+ lt_cv_path_MAGIC_CMD="$ac_dir/file"
+ if test -n "$file_magic_test_file"; then
+ case $deplibs_check_method in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
EOF
- if { ac_try='$NASM -f elf conftest_3dnow.s'; { (eval echo configure:2574: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
- echo "$ac_t""yes" 1>&6
- else
- echo "$ac_t""no" 1>&6
- { echo "configure: error: Download a patched nasm from the Gromacs homepage,
- or disable 3DNow assembly." 1>&2; exit 1; }
- fi
+ fi ;;
+ esac
fi
+ break
fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC_CMD="$ac_save_MAGIC_CMD"
+ ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+ echo "$ac_t""$MAGIC_CMD" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+ else
+ MAGIC_CMD=:
fi
-else # not x86
-enable_sse=no
-enable_3dnow=no
fi
-# Extract the first word of "ident", so it can be a program name with args.
-set dummy ident; ac_word=$2
+ fi
+ ;;
+esac
+
+# Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3029: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ac_t""$RANLIB" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_RANLIB"; then
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3061: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ ac_cv_prog_RANLIB="ranlib"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ac_t""$RANLIB" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+else
+ RANLIB=":"
+fi
+fi
+
+# Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2592: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_path_IDENT'+set}'`\" = set"; then
+echo "configure:3096: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
- case "$IDENT" in
- /*)
- ac_cv_path_IDENT="$IDENT" # Let the user override the test with a path.
- ;;
- ?:/*)
- ac_cv_path_IDENT="$IDENT" # Let the user override the test with a dos path.
- ;;
- *)
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_dummy="$PATH"
- for ac_dir in $ac_dummy; do
+ for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
- ac_cv_path_IDENT="$ac_dir/$ac_word"
+ ac_cv_prog_STRIP="${ac_tool_prefix}strip"
break
fi
done
IFS="$ac_save_ifs"
- test -z "$ac_cv_path_IDENT" && ac_cv_path_IDENT="no"
- ;;
-esac
fi
-IDENT="$ac_cv_path_IDENT"
-if test -n "$IDENT"; then
- echo "$ac_t""$IDENT" 1>&6
-else
- echo "$ac_t""no" 1>&6
fi
-
-if test "$IDENT" != "no"; then
- # seems as if we have the ident program, but does the
- # compiler support it?
- echo $ac_n "checking whether the compiler supports ident""... $ac_c" 1>&6
-echo "configure:2629: checking whether the compiler supports ident" >&5
- cat > conftest.$ac_ext <<EOF
-#line 2631 "configure"
-#include "confdefs.h"
-#ident "@(#) file.h 1.1 12/16/92"
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2636: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
- rm -rf conftest*
-
- echo "$ac_t""yes" 1>&6
- cat >> confdefs.h <<\EOF
-#define HAVE_IDENT
-EOF
-
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ac_t""$STRIP" 1>&6
else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
echo "$ac_t""no" 1>&6
fi
-rm -f conftest*
-fi
-# Extract the first word of "ranlib", so it can be a program name with args.
-set dummy ranlib; ac_word=$2
+
+if test -z "$ac_cv_prog_STRIP"; then
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2659: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+echo "configure:3128: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
- if test -n "$RANLIB"; then
- ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_dummy="$PATH"
for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
- ac_cv_prog_RANLIB="ranlib"
+ ac_cv_prog_STRIP="strip"
break
fi
done
IFS="$ac_save_ifs"
- test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+ test -z "$ac_cv_prog_STRIP" && ac_cv_prog_STRIP=":"
fi
fi
-RANLIB="$ac_cv_prog_RANLIB"
-if test -n "$RANLIB"; then
- echo "$ac_t""$RANLIB" 1>&6
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ac_t""$STRIP" 1>&6
else
echo "$ac_t""no" 1>&6
fi
+else
+ STRIP=":"
+fi
+fi
+
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+
+
+# Check whether --enable-libtool-lock or --disable-libtool-lock was given.
+if test "${enable_libtool_lock+set}" = set; then
+ enableval="$enable_libtool_lock"
+ :
+fi
+
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+# Check whether --with-pic or --without-pic was given.
+if test "${with_pic+set}" = set; then
+ withval="$with_pic"
+ pic_mode="$withval"
+else
+ pic_mode=default
+fi
+
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '#line 3195 "configure"' > conftest.$ac_ext
+ if { (eval echo configure:3196: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ case `/usr/bin/file conftest.$ac_objext` in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ echo $ac_n "checking whether the C compiler needs -belf""... $ac_c" 1>&6
+echo "configure:3217: checking whether the C compiler needs -belf" >&5
+if eval "test \"`echo '$''{'lt_cv_cc_needs_belf'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat > conftest.$ac_ext <<EOF
+#line 3230 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3237: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=no
+fi
+rm -f conftest*
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+fi
+
+echo "$ac_t""$lt_cv_cc_needs_belf" 1>&6
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+
+esac
+
+
+# Save cache, so that ltconfig can load it
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >> confcache
+if cmp -s $cache_file confcache; then
+ :
+else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache > $cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| { echo "configure: error: libtool configure failed" 1>&2; exit 1; }
+
+# Reload cache, that may have been modified by ltconfig
+if test -r "$cache_file"; then
+ echo "loading cache $cache_file"
+ . $cache_file
+else
+ echo "creating cache $cache_file"
+ > $cache_file
+fi
+
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+
+
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| { echo "configure: error: libtool tag configuration failed" 1>&2; exit 1; }
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+
+
+
+
+
+
+
for ac_func in strcasecmp
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2689: checking for $ac_func" >&5
+echo "configure:3377: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2694 "configure"
+#line 3382 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2717: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3405: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
for ac_func in strdup
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2744: checking for $ac_func" >&5
+echo "configure:3432: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2749 "configure"
+#line 3437 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2772: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3460: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
# Checks for libraries.
############################################################################
echo $ac_n "checking for sqrt in -lm""... $ac_c" 1>&6
-echo "configure:2880: checking for sqrt in -lm" >&5
+echo "configure:3568: checking for sqrt in -lm" >&5
ac_lib_var=`echo m'_'sqrt | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lm $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2888 "configure"
+#line 3576 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
sqrt()
; return 0; }
EOF
-if { (eval echo configure:2899: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3587: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# libm in the link list, thus the test goes after m!
if test "${host_vendor}" = "ibm"; then
echo $ac_n "checking for main in -lxlopt""... $ac_c" 1>&6
-echo "configure:2934: checking for main in -lxlopt" >&5
+echo "configure:3622: checking for main in -lxlopt" >&5
ac_lib_var=`echo xlopt'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lxlopt $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2942 "configure"
+#line 3630 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:2949: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3637: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fi
echo $ac_n "checking for main in -lmass""... $ac_c" 1>&6
-echo "configure:2977: checking for main in -lmass" >&5
+echo "configure:3665: checking for main in -lmass" >&5
ac_lib_var=`echo mass'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lmass $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2985 "configure"
+#line 3673 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:2992: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3680: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
case "$gmxcpu" in
power4*)
echo $ac_n "checking for main in -lmassvp4""... $ac_c" 1>&6
-echo "configure:3023: checking for main in -lmassvp4" >&5
+echo "configure:3711: checking for main in -lmassvp4" >&5
ac_lib_var=`echo massvp4'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lmassvp4 $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3031 "configure"
+#line 3719 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3038: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3726: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
;;
power3*)
echo $ac_n "checking for main in -lmassvp3""... $ac_c" 1>&6
-echo "configure:3060: checking for main in -lmassvp3" >&5
+echo "configure:3748: checking for main in -lmassvp3" >&5
ac_lib_var=`echo massvp3'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lmassvp3 $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3068 "configure"
+#line 3756 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3763: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
;;
power2*)
echo $ac_n "checking for main in -lmassvp3""... $ac_c" 1>&6
-echo "configure:3097: checking for main in -lmassvp3" >&5
+echo "configure:3785: checking for main in -lmassvp3" >&5
ac_lib_var=`echo massvp3'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lmassvp3 $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3105 "configure"
+#line 3793 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3112: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3800: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
;;
*)
echo $ac_n "checking for main in -lmassv""... $ac_c" 1>&6
-echo "configure:3134: checking for main in -lmassv" >&5
+echo "configure:3822: checking for main in -lmassv" >&5
ac_lib_var=`echo massv'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lmassv $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3142 "configure"
+#line 3830 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3149: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3837: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fi
fi
-if test "$enable_double" = "yes"; then
- precision=8
-else
+if test "$enable_float" = "yes"; then
precision=4
+else
+ precision=8
fi
if test "$enable_mpi" = "yes"; then
ok="no"
# check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
echo $ac_n "checking for fftw_mpi.h""... $ac_c" 1>&6
-echo "configure:3200: checking for fftw_mpi.h" >&5
+echo "configure:3888: checking for fftw_mpi.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3202 "configure"
+#line 3890 "configure"
#include "confdefs.h"
#include <fftw_mpi.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3209: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3897: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=fftw_mpi
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
cat > conftest.$ac_ext <<EOF
-#line 3227 "configure"
+#line 3915 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3234: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3922: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ok="yes"
else
if test "$ok" != "yes"; then
xfftwname=${fftwcheckprefix}${fftwname}
echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3251: checking for $xfftwname.h" >&5
+echo "configure:3939: checking for $xfftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3253 "configure"
+#line 3941 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3260: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3948: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
else
echo "$ac_t""no" 1>&6
{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
cat > conftest.$ac_ext <<EOF
-#line 3282 "configure"
+#line 3975 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3289: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3982: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=$xfftwname
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
- { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
fi
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3313: checking for main in -l$fftwname" >&5
+echo "configure:4012: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3321 "configure"
+#line 4020 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3328: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4027: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fftwname=${ac_fftw_savedprefix}fftw_mpi
echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3363: checking for $fftwname.h" >&5
+echo "configure:4062: checking for $fftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3365 "configure"
+#line 4064 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3372: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4071: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3376: checking for main in -l$fftwname" >&5
+echo "configure:4075: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3384 "configure"
+#line 4083 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3391: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4090: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
ok="no"
# check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
echo $ac_n "checking for rfftw_mpi.h""... $ac_c" 1>&6
-echo "configure:3448: checking for rfftw_mpi.h" >&5
+echo "configure:4147: checking for rfftw_mpi.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3450 "configure"
+#line 4149 "configure"
#include "confdefs.h"
#include <rfftw_mpi.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3457: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4156: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=rfftw_mpi
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
cat > conftest.$ac_ext <<EOF
-#line 3475 "configure"
+#line 4174 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3482: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4181: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ok="yes"
else
if test "$ok" != "yes"; then
xfftwname=${fftwcheckprefix}${fftwname}
echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3499: checking for $xfftwname.h" >&5
+echo "configure:4198: checking for $xfftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3501 "configure"
+#line 4200 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3508: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4207: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
else
echo "$ac_t""no" 1>&6
{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
cat > conftest.$ac_ext <<EOF
-#line 3530 "configure"
+#line 4234 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3537: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4241: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=$xfftwname
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
- { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
fi
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3561: checking for main in -l$fftwname" >&5
+echo "configure:4271: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3569 "configure"
+#line 4279 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3576: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4286: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fftwname=${ac_fftw_savedprefix}rfftw_mpi
echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3611: checking for $fftwname.h" >&5
+echo "configure:4321: checking for $fftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3613 "configure"
+#line 4323 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3620: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4330: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3624: checking for main in -l$fftwname" >&5
+echo "configure:4334: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3632 "configure"
+#line 4342 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3639: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4349: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
ok="no"
# check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
echo $ac_n "checking for fftw.h""... $ac_c" 1>&6
-echo "configure:3698: checking for fftw.h" >&5
+echo "configure:4408: checking for fftw.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3700 "configure"
+#line 4410 "configure"
#include "confdefs.h"
#include <fftw.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3707: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4417: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=fftw
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
cat > conftest.$ac_ext <<EOF
-#line 3725 "configure"
+#line 4435 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3732: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4442: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ok="yes"
else
if test "$ok" != "yes"; then
xfftwname=${fftwcheckprefix}${fftwname}
echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3749: checking for $xfftwname.h" >&5
+echo "configure:4459: checking for $xfftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3751 "configure"
+#line 4461 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3758: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4468: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
else
echo "$ac_t""no" 1>&6
{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
cat > conftest.$ac_ext <<EOF
-#line 3780 "configure"
+#line 4495 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3787: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4502: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=$xfftwname
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
- { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
fi
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3811: checking for main in -l$fftwname" >&5
+echo "configure:4532: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3819 "configure"
+#line 4540 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3826: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4547: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fftwname=${ac_fftw_savedprefix}fftw
echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3861: checking for $fftwname.h" >&5
+echo "configure:4582: checking for $fftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3863 "configure"
+#line 4584 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3870: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4591: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3874: checking for main in -l$fftwname" >&5
+echo "configure:4595: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 3882 "configure"
+#line 4603 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:3889: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4610: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
ok="no"
# check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
echo $ac_n "checking for rfftw.h""... $ac_c" 1>&6
-echo "configure:3946: checking for rfftw.h" >&5
+echo "configure:4667: checking for rfftw.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3948 "configure"
+#line 4669 "configure"
#include "confdefs.h"
#include <rfftw.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:3955: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4676: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=rfftw
if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
cat > conftest.$ac_ext <<EOF
-#line 3973 "configure"
+#line 4694 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:3980: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4701: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ok="yes"
else
if test "$ok" != "yes"; then
xfftwname=${fftwcheckprefix}${fftwname}
echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3997: checking for $xfftwname.h" >&5
+echo "configure:4718: checking for $xfftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 3999 "configure"
+#line 4720 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:4006: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4727: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
else
echo "$ac_t""no" 1>&6
{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
cat > conftest.$ac_ext <<EOF
-#line 4028 "configure"
+#line 4754 "configure"
#include "confdefs.h"
#include <$xfftwname.h>
int main() {
int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
; return 0; }
EOF
-if { (eval echo configure:4035: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4761: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
fftwname=$xfftwname
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
- { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double.
+
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double.
If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
fi
rm -f conftest*
fi
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:4059: checking for main in -l$fftwname" >&5
+echo "configure:4791: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4067 "configure"
+#line 4799 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:4074: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4806: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fftwname=${ac_fftw_savedprefix}rfftw
echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:4109: checking for $fftwname.h" >&5
+echo "configure:4841: checking for $fftwname.h" >&5
cat > conftest.$ac_ext <<EOF
-#line 4111 "configure"
+#line 4843 "configure"
#include "confdefs.h"
#include <$fftwname.h>
int main() {
; return 0; }
EOF
-if { (eval echo configure:4118: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4850: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
echo "$ac_t""yes" 1>&6
echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:4122: checking for main in -l$fftwname" >&5
+echo "configure:4854: checking for main in -l$fftwname" >&5
ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-l$fftwname $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4130 "configure"
+#line 4862 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:4137: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4869: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
######
if test "$enable_xdr" = "no"; then
- echo "configure: warning: * Not using XDR cripples Gromacs significantly. You won't be able to *
- * read or write any hardware independent or compressed trajectories. *
- * We strongly suggest you try to locate the RPC routines instead! *" 1>&2
+ echo "configure: warning: * Not using XDR cripples GROMACS significantly. You won't be able to *
+ * read or write any compressed trajectories. You have no choice on *
+ * windows, but if you run UNIX locate the RPC files - you have them! *" 1>&2
else
# check for xtc headers
for ac_hdr in rpc/rpc.h rpc/xdr.h
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:4212: checking for $ac_hdr" >&5
+echo "configure:4944: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 4217 "configure"
+#line 4949 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:4222: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:4954: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
# check for xtc libs
# on solaris the xdr stuff is in -lnsl
echo $ac_n "checking for xdr_float in -lnsl""... $ac_c" 1>&6
-echo "configure:4252: checking for xdr_float in -lnsl" >&5
+echo "configure:4984: checking for xdr_float in -lnsl" >&5
ac_lib_var=`echo nsl'_'xdr_float | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lnsl $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4260 "configure"
+#line 4992 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
xdr_float()
; return 0; }
EOF
-if { (eval echo configure:4271: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5003: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fi
cat > conftest.$ac_ext <<EOF
-#line 4299 "configure"
+#line 5031 "configure"
#include "confdefs.h"
#include<rpc/rpc.h>
#include<rpc/xdr.h>
XDR *xd; float f; xdr_float(xd,&f);
; return 0; }
EOF
-if { (eval echo configure:4307: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5039: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
:
else
echo "configure: failed program was:" >&5
# Uses ac_ vars as temps to allow command line to override cache and checks.
# --without-x overrides everything else, but does not touch the cache.
echo $ac_n "checking for X""... $ac_c" 1>&6
-echo "configure:4334: checking for X" >&5
+echo "configure:5066: checking for X" >&5
# Check whether --with-x or --without-x was given.
if test "${with_x+set}" = set; then
# First, try using that file with no special directory specified.
cat > conftest.$ac_ext <<EOF
-#line 4396 "configure"
+#line 5128 "configure"
#include "confdefs.h"
#include <$x_direct_test_include>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:4401: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:5133: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
ac_save_LIBS="$LIBS"
LIBS="-l$x_direct_test_library $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4470 "configure"
+#line 5202 "configure"
#include "confdefs.h"
int main() {
${x_direct_test_function}()
; return 0; }
EOF
-if { (eval echo configure:4477: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5209: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
LIBS="$ac_save_LIBS"
# We can link X programs with no special library path.
case "`(uname -sr) 2>/dev/null`" in
"SunOS 5"*)
echo $ac_n "checking whether -R must be followed by a space""... $ac_c" 1>&6
-echo "configure:4583: checking whether -R must be followed by a space" >&5
+echo "configure:5315: checking whether -R must be followed by a space" >&5
ac_xsave_LIBS="$LIBS"; LIBS="$LIBS -R$x_libraries"
cat > conftest.$ac_ext <<EOF
-#line 4586 "configure"
+#line 5318 "configure"
#include "confdefs.h"
int main() {
; return 0; }
EOF
-if { (eval echo configure:4593: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5325: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_R_nospace=yes
else
else
LIBS="$ac_xsave_LIBS -R $x_libraries"
cat > conftest.$ac_ext <<EOF
-#line 4609 "configure"
+#line 5341 "configure"
#include "confdefs.h"
int main() {
; return 0; }
EOF
-if { (eval echo configure:4616: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5348: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
ac_R_space=yes
else
# libraries were built with DECnet support. And karl@cs.umb.edu says
# the Alpha needs dnet_stub (dnet does not exist).
echo $ac_n "checking for dnet_ntoa in -ldnet""... $ac_c" 1>&6
-echo "configure:4648: checking for dnet_ntoa in -ldnet" >&5
+echo "configure:5380: checking for dnet_ntoa in -ldnet" >&5
ac_lib_var=`echo dnet'_'dnet_ntoa | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-ldnet $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4656 "configure"
+#line 5388 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
dnet_ntoa()
; return 0; }
EOF
-if { (eval echo configure:4667: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5399: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
if test $ac_cv_lib_dnet_dnet_ntoa = no; then
echo $ac_n "checking for dnet_ntoa in -ldnet_stub""... $ac_c" 1>&6
-echo "configure:4689: checking for dnet_ntoa in -ldnet_stub" >&5
+echo "configure:5421: checking for dnet_ntoa in -ldnet_stub" >&5
ac_lib_var=`echo dnet_stub'_'dnet_ntoa | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-ldnet_stub $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4697 "configure"
+#line 5429 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
dnet_ntoa()
; return 0; }
EOF
-if { (eval echo configure:4708: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5440: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# The nsl library prevents programs from opening the X display
# on Irix 5.2, according to dickey@clark.net.
echo $ac_n "checking for gethostbyname""... $ac_c" 1>&6
-echo "configure:4737: checking for gethostbyname" >&5
+echo "configure:5469: checking for gethostbyname" >&5
if eval "test \"`echo '$''{'ac_cv_func_gethostbyname'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 4742 "configure"
+#line 5474 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char gethostbyname(); below. */
; return 0; }
EOF
-if { (eval echo configure:4765: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5497: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_gethostbyname=yes"
else
if test $ac_cv_func_gethostbyname = no; then
echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
-echo "configure:4786: checking for gethostbyname in -lnsl" >&5
+echo "configure:5518: checking for gethostbyname in -lnsl" >&5
ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lnsl $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4794 "configure"
+#line 5526 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
gethostbyname()
; return 0; }
EOF
-if { (eval echo configure:4805: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5537: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# -lsocket must be given before -lnsl if both are needed.
# We assume that if connect needs -lnsl, so does gethostbyname.
echo $ac_n "checking for connect""... $ac_c" 1>&6
-echo "configure:4835: checking for connect" >&5
+echo "configure:5567: checking for connect" >&5
if eval "test \"`echo '$''{'ac_cv_func_connect'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 4840 "configure"
+#line 5572 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char connect(); below. */
; return 0; }
EOF
-if { (eval echo configure:4863: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5595: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_connect=yes"
else
if test $ac_cv_func_connect = no; then
echo $ac_n "checking for connect in -lsocket""... $ac_c" 1>&6
-echo "configure:4884: checking for connect in -lsocket" >&5
+echo "configure:5616: checking for connect in -lsocket" >&5
ac_lib_var=`echo socket'_'connect | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lsocket $X_EXTRA_LIBS $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4892 "configure"
+#line 5624 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
connect()
; return 0; }
EOF
-if { (eval echo configure:4903: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5635: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# gomez@mi.uni-erlangen.de says -lposix is necessary on A/UX.
echo $ac_n "checking for remove""... $ac_c" 1>&6
-echo "configure:4927: checking for remove" >&5
+echo "configure:5659: checking for remove" >&5
if eval "test \"`echo '$''{'ac_cv_func_remove'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 4932 "configure"
+#line 5664 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char remove(); below. */
; return 0; }
EOF
-if { (eval echo configure:4955: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5687: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_remove=yes"
else
if test $ac_cv_func_remove = no; then
echo $ac_n "checking for remove in -lposix""... $ac_c" 1>&6
-echo "configure:4976: checking for remove in -lposix" >&5
+echo "configure:5708: checking for remove in -lposix" >&5
ac_lib_var=`echo posix'_'remove | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lposix $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 4984 "configure"
+#line 5716 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
remove()
; return 0; }
EOF
-if { (eval echo configure:4995: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5727: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# BSDI BSD/OS 2.1 needs -lipc for XOpenDisplay.
echo $ac_n "checking for shmat""... $ac_c" 1>&6
-echo "configure:5019: checking for shmat" >&5
+echo "configure:5751: checking for shmat" >&5
if eval "test \"`echo '$''{'ac_cv_func_shmat'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5024 "configure"
+#line 5756 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char shmat(); below. */
; return 0; }
EOF
-if { (eval echo configure:5047: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5779: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_shmat=yes"
else
if test $ac_cv_func_shmat = no; then
echo $ac_n "checking for shmat in -lipc""... $ac_c" 1>&6
-echo "configure:5068: checking for shmat in -lipc" >&5
+echo "configure:5800: checking for shmat in -lipc" >&5
ac_lib_var=`echo ipc'_'shmat | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lipc $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 5076 "configure"
+#line 5808 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
shmat()
; return 0; }
EOF
-if { (eval echo configure:5087: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5819: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# libraries we check for below, so use a different variable.
# --interran@uluru.Stanford.EDU, kb@cs.umb.edu.
echo $ac_n "checking for IceConnectionNumber in -lICE""... $ac_c" 1>&6
-echo "configure:5120: checking for IceConnectionNumber in -lICE" >&5
+echo "configure:5852: checking for IceConnectionNumber in -lICE" >&5
ac_lib_var=`echo ICE'_'IceConnectionNumber | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lICE $X_EXTRA_LIBS $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 5128 "configure"
+#line 5860 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
IceConnectionNumber()
; return 0; }
EOF
-if { (eval echo configure:5139: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5871: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
echo $ac_n "checking for Motif""... $ac_c" 1>&6
-echo "configure:5193: checking for Motif" >&5
+echo "configure:5925: checking for Motif" >&5
#
#
ac_cv_motif_includes="none"
cat > conftest.$ac_ext <<EOF
-#line 5217 "configure"
+#line 5949 "configure"
#include "confdefs.h"
#include <Xm/Xm.h>
int main() {
int a;
; return 0; }
EOF
-if { (eval echo configure:5224: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:5956: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
# Xm/Xm.h is in the standard search path.
#
ac_cv_motif_libraries="none"
cat > conftest.$ac_ext <<EOF
-#line 5289 "configure"
+#line 6021 "configure"
#include "confdefs.h"
#include <Xm/Xm.h>
int main() {
XtToolkitInitialize();
; return 0; }
EOF
-if { (eval echo configure:5296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6028: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
# libXm.a is in the standard search path.
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:5403: checking for $ac_hdr" >&5
+echo "configure:6135: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5408 "configure"
+#line 6140 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:5413: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:6145: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
#####
# Checks for typedefs, structures, and compiler characteristics.
echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:5444: checking for working const" >&5
+echo "configure:6176: checking for working const" >&5
if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5449 "configure"
+#line 6181 "configure"
#include "confdefs.h"
int main() {
; return 0; }
EOF
-if { (eval echo configure:5498: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6230: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_c_const=yes
else
fi
echo $ac_n "checking for ANSI C header files""... $ac_c" 1>&6
-echo "configure:5519: checking for ANSI C header files" >&5
+echo "configure:6251: checking for ANSI C header files" >&5
if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5524 "configure"
+#line 6256 "configure"
#include "confdefs.h"
#include <stdlib.h>
#include <stdarg.h>
#include <float.h>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:5532: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:6264: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
if test $ac_cv_header_stdc = yes; then
# SunOS 4.x string.h does not declare mem*, contrary to ANSI.
cat > conftest.$ac_ext <<EOF
-#line 5549 "configure"
+#line 6281 "configure"
#include "confdefs.h"
#include <string.h>
EOF
if test $ac_cv_header_stdc = yes; then
# ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
cat > conftest.$ac_ext <<EOF
-#line 5567 "configure"
+#line 6299 "configure"
#include "confdefs.h"
#include <stdlib.h>
EOF
:
else
cat > conftest.$ac_ext <<EOF
-#line 5588 "configure"
+#line 6320 "configure"
#include "confdefs.h"
#include <ctype.h>
#define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
exit (0); }
EOF
-if { (eval echo configure:5599: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:6331: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
:
else
fi
echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:5623: checking for size_t" >&5
+echo "configure:6355: checking for size_t" >&5
if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5628 "configure"
+#line 6360 "configure"
#include "confdefs.h"
#include <sys/types.h>
#if STDC_HEADERS
fi
echo $ac_n "checking whether struct tm is in sys/time.h or time.h""... $ac_c" 1>&6
-echo "configure:5656: checking whether struct tm is in sys/time.h or time.h" >&5
+echo "configure:6388: checking whether struct tm is in sys/time.h or time.h" >&5
if eval "test \"`echo '$''{'ac_cv_struct_tm'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5661 "configure"
+#line 6393 "configure"
#include "confdefs.h"
#include <sys/types.h>
#include <time.h>
struct tm *tp; tp->tm_sec;
; return 0; }
EOF
-if { (eval echo configure:5669: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6401: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_struct_tm=time.h
else
fi
echo $ac_n "checking for uid_t in sys/types.h""... $ac_c" 1>&6
-echo "configure:5690: checking for uid_t in sys/types.h" >&5
+echo "configure:6422: checking for uid_t in sys/types.h" >&5
if eval "test \"`echo '$''{'ac_cv_type_uid_t'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5695 "configure"
+#line 6427 "configure"
#include "confdefs.h"
#include <sys/types.h>
EOF
fi
echo $ac_n "checking for inline""... $ac_c" 1>&6
-echo "configure:5724: checking for inline" >&5
+echo "configure:6456: checking for inline" >&5
if eval "test \"`echo '$''{'ac_cv_c_inline'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
ac_cv_c_inline=no
for ac_kw in inline __inline__ __inline; do
cat > conftest.$ac_ext <<EOF
-#line 5731 "configure"
+#line 6463 "configure"
#include "confdefs.h"
int main() {
} $ac_kw foo() {
; return 0; }
EOF
-if { (eval echo configure:5738: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6470: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_c_inline=$ac_kw; break
else
# Checks for library functions.
#AC_FUNC_MALLOC
echo $ac_n "checking for 8-bit clean memcmp""... $ac_c" 1>&6
-echo "configure:5768: checking for 8-bit clean memcmp" >&5
+echo "configure:6500: checking for 8-bit clean memcmp" >&5
if eval "test \"`echo '$''{'ac_cv_func_memcmp_clean'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
ac_cv_func_memcmp_clean=no
else
cat > conftest.$ac_ext <<EOF
-#line 5776 "configure"
+#line 6508 "configure"
#include "confdefs.h"
main()
}
EOF
-if { (eval echo configure:5786: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:6518: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
ac_cv_func_memcmp_clean=yes
else
test $ac_cv_func_memcmp_clean = no && LIBOBJS="$LIBOBJS memcmp.${ac_objext}"
echo $ac_n "checking return type of signal handlers""... $ac_c" 1>&6
-echo "configure:5804: checking return type of signal handlers" >&5
+echo "configure:6536: checking return type of signal handlers" >&5
if eval "test \"`echo '$''{'ac_cv_type_signal'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5809 "configure"
+#line 6541 "configure"
#include "confdefs.h"
#include <sys/types.h>
#include <signal.h>
int i;
; return 0; }
EOF
-if { (eval echo configure:5826: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6558: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_type_signal=void
else
echo $ac_n "checking for vprintf""... $ac_c" 1>&6
-echo "configure:5845: checking for vprintf" >&5
+echo "configure:6577: checking for vprintf" >&5
if eval "test \"`echo '$''{'ac_cv_func_vprintf'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5850 "configure"
+#line 6582 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char vprintf(); below. */
; return 0; }
EOF
-if { (eval echo configure:5873: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6605: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_vprintf=yes"
else
if test "$ac_cv_func_vprintf" != yes; then
echo $ac_n "checking for _doprnt""... $ac_c" 1>&6
-echo "configure:5897: checking for _doprnt" >&5
+echo "configure:6629: checking for _doprnt" >&5
if eval "test \"`echo '$''{'ac_cv_func__doprnt'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 5902 "configure"
+#line 6634 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char _doprnt(); below. */
; return 0; }
EOF
-if { (eval echo configure:5925: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6657: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func__doprnt=yes"
else
# Substitute things in output and header files.
########################################################################
SUFFIX=""
+GMXLIB_COND_OBJ=""
if test "$enable_mpi" = "yes"; then
cat >> confdefs.h <<\EOF
#define USE_MPI
EOF
- PAR_OBJ='${mpi_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mpiio.lo"
if test "$enable_mpi_suffix" = "yes"; then
SUFFIX="_mpi"
fi
else
- PAR_OBJ='${libnet_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libnet.lo"
fi
if test "$enable_vector" = "yes"; then
#define USE_FORTRAN
EOF
- INNER_F77_OBJ='${inner_f77_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerf.lo f77_wrappers.lo"
+ if test "$enable_float" = "yes"; then
+ MDLIB_COND_OBJ="flincs.lo fsettle.lo fshake.lo"
+ else
+ MDLIB_COND_OBJ="flincsd.lo fsettled.lo fshaked.lo"
+ fi
if true; then
USE_FORTRAN_FALSE=
fi
else
- INNER_C_OBJ='${inner_c_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerc.lo"
+ MDLIB_COND_OBJ="clincs.lo csettle.lo cshake.lo"
if false; then
fi
fi
-if test "$enable_double" = "yes"; then
+if test "$enable_float" = "no"; then
cat >> confdefs.h <<\EOF
#define DOUBLE
EOF
fi
-if test "$enable_sse" = "yes"; then
+if test "$enable_x86_asm" = "yes"; then
cat >> confdefs.h <<\EOF
#define USE_SSE
EOF
- SSE_OBJ='${sse_obj}'
-fi
-
-if test "$enable_3dnow" = "yes"; then
- cat >> confdefs.h <<\EOF
-#define USE_3DNOW
-EOF
-
- TDN_OBJ='${tdn_obj}'
-fi
-
-if test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"; then
- X86_ASM_OBJ='${x86_asm_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} x86_cpuid.lo x86_sse.lo x86_3dnow.lo"
fi
if test "$motif_includes" != "none" -a "$motif_libraries" != "none"; then
- MOTIF_OBJ='${motif_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mgmx.lo widget.lo"
fi
if test "$enable_xdr" = "yes"; then
#define USE_XDR
EOF
- XDR_OBJ='${xdr_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libxdrf.lo ftocstr.lo"
else
- XDR_OBJ='${noxdr_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} dumxdrf.lo"
fi
if test "$enable_softwaresqrt" = "yes"; then
fi
-
-
-
-
-
-
-
-
- # not used right now
-
-
-
# Check if there are any optimizations and options for this arch and cpu
# determine our suggested choices for both C and fortran, and then possibly
# override them with user choices.
+cc_vendor="unknown"
+
case "${host_cpu}-${host_os}" in
*-solaris2*)
esac
if $CC -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$tmpCFLAGS"
+ cc_vendor="Compaq"
fi
if test "$enable_fortran" = "yes"; then
if $F77 -V 2> /dev/null | grep Compaq > /dev/null 2>&1; then
xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
fi
if test "$enable_fortran" = "yes"; then
- if $F77 -V 2> /dev/null | grep Portland /dev/null 2>&1; then
+ if $F77 -V 2> /dev/null | grep Portland > /dev/null 2>&1; then
xFFLAGS="$xCFLAGS -Mneginfo=loop"
fi
fi
echo $ac_n "checking whether $CC accepts -malign-double""... $ac_c" 1>&6
-echo "configure:6532: checking whether $CC accepts -malign-double" >&5
+echo "configure:7250: checking whether $CC accepts -malign-double" >&5
if eval "test \"`echo '$''{'ac_align_double'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
CPU_FLAGS=""
+
if test "$GCC" = "yes"; then
+
+
+if true; then
+ GNU_CC_TRUE=
+ GNU_CC_FALSE='#'
+else
+ GNU_CC_TRUE='#'
+ GNU_CC_FALSE=
+fi
# try to guess correct CPU flags, at least for linux
case "${host_cpu}" in
# i586/i686 cpu flags don't improve speed, thus no need to use them.
echo $ac_n "checking whether $CC accepts -mcpu=$cputype""... $ac_c" 1>&6
-echo "configure:6579: checking whether $CC accepts -mcpu=$cputype" >&5
+echo "configure:7307: checking whether $CC accepts -mcpu=$cputype" >&5
if eval "test \"`echo '$''{'ac_m_cpu_60x'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo $ac_n "checking whether $CC accepts -mcpu=750""... $ac_c" 1>&6
-echo "configure:6606: checking whether $CC accepts -mcpu=750" >&5
+echo "configure:7334: checking whether $CC accepts -mcpu=750" >&5
if eval "test \"`echo '$''{'ac_m_cpu_750'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo $ac_n "checking whether $CC accepts -mcpu=powerpc""... $ac_c" 1>&6
-echo "configure:6634: checking whether $CC accepts -mcpu=powerpc" >&5
+echo "configure:7362: checking whether $CC accepts -mcpu=powerpc" >&5
if eval "test \"`echo '$''{'ac_m_cpu_powerpc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo $ac_n "checking whether $CC accepts -mpowerpc""... $ac_c" 1>&6
-echo "configure:6662: checking whether $CC accepts -mpowerpc" >&5
+echo "configure:7390: checking whether $CC accepts -mpowerpc" >&5
if eval "test \"`echo '$''{'ac_m_powerpc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
fi
esac
+else
+
+
+if false; then
+ GNU_CC_TRUE=
+ GNU_CC_FALSE='#'
+else
+ GNU_CC_TRUE='#'
+ GNU_CC_FALSE=
+fi
fi
if test -n "$CPU_FLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the C *"
echo "* compiler. Use make CFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
CFLAGS="-O3"
fi
echo $ac_n "checking whether $CC accepts ${CFLAGS}""... $ac_c" 1>&6
-echo "configure:6714: checking whether $CC accepts ${CFLAGS}" >&5
+echo "configure:7452: checking whether $CC accepts ${CFLAGS}" >&5
if eval "test \"`echo '$''{'ac_guessed_cflags'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test "$enable_fortran" = "yes"; then
if test "$ac_test_FFLAGS" != "set"; then
FFLAGS="$xFFLAGS"
-
if test -z "$FFLAGS"; then
echo "*******************************************************************"
echo "* WARNING: No special optimization settings found for the fortran *"
echo "* compiler. Use make FFLAGS=..., or edit the top level Makefile. *"
- echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+ echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
echo "*******************************************************************"
FFLAGS="-O3"
fi
echo $ac_n "checking whether $F77 accepts ${FFLAGS}""... $ac_c" 1>&6
-echo "configure:6764: checking whether $F77 accepts ${FFLAGS}" >&5
+echo "configure:7501: checking whether $F77 accepts ${FFLAGS}" >&5
if eval "test \"`echo '$''{'ac_guessed_fflags'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo "******************************************"
fi
fi
+
# should be automatic, but doesnt seem to be?
+
+
+
+
+
+
# put binaries and libraries in subdirectories named as the arch
if test -n "$gmxcpu"; then
bindir="\${exec_prefix}/bin/${host}/${gmxcpu}"
-
trap '' 1 2 15
cat > confcache <<\EOF
# This file is a shell script that caches the results of configure
s%@AUTOMAKE@%$AUTOMAKE%g
s%@AUTOHEADER@%$AUTOHEADER%g
s%@MAKEINFO@%$MAKEINFO%g
-s%@AMTAR@%$AMTAR%g
-s%@install_sh@%$install_sh%g
-s%@AWK@%$AWK%g
s%@SET_MAKE@%$SET_MAKE%g
-s%@AMDEP@%$AMDEP%g
-s%@AMDEPBACKSLASH@%$AMDEPBACKSLASH%g
-s%@DEPDIR@%$DEPDIR%g
s%@host@%$host%g
s%@host_alias@%$host_alias%g
s%@host_cpu@%$host_cpu%g
s%@F77@%$F77%g
s%@CC@%$CC%g
s%@CPP@%$CPP%g
-s%@_am_include@%$_am_include%g
-s%@CCDEPMODE@%$CCDEPMODE%g
s%@BUILD_CC@%$BUILD_CC%g
s%@FLIBS@%$FLIBS%g
s%@MPICC@%$MPICC%g
s%@USE_MPI_TRUE@%$USE_MPI_TRUE%g
s%@USE_MPI_FALSE@%$USE_MPI_FALSE%g
-s%@NASM@%$NASM%g
-s%@NASMFLAGS@%$NASMFLAGS%g
s%@IDENT@%$IDENT%g
+s%@LN_S@%$LN_S%g
+s%@build@%$build%g
+s%@build_alias@%$build_alias%g
+s%@build_cpu@%$build_cpu%g
+s%@build_vendor@%$build_vendor%g
+s%@build_os@%$build_os%g
+s%@OBJEXT@%$OBJEXT%g
+s%@EXEEXT@%$EXEEXT%g
s%@RANLIB@%$RANLIB%g
+s%@STRIP@%$STRIP%g
+s%@LIBTOOL@%$LIBTOOL%g
s%@GMX_USE_XDR_TRUE@%$GMX_USE_XDR_TRUE%g
s%@GMX_USE_XDR_FALSE@%$GMX_USE_XDR_FALSE%g
s%@X_CFLAGS@%$X_CFLAGS%g
s%@USE_FORTRAN_FALSE@%$USE_FORTRAN_FALSE%g
s%@USE_DOUBLE_TRUE@%$USE_DOUBLE_TRUE%g
s%@USE_DOUBLE_FALSE@%$USE_DOUBLE_FALSE%g
-s%@PAR_OBJ@%$PAR_OBJ%g
-s%@INNER_F77_OBJ@%$INNER_F77_OBJ%g
-s%@INNER_C_OBJ@%$INNER_C_OBJ%g
-s%@SSE_OBJ@%$SSE_OBJ%g
-s%@TDN_OBJ@%$TDN_OBJ%g
-s%@X86_ASM_OBJ@%$X86_ASM_OBJ%g
-s%@MOTIF_OBJ@%$MOTIF_OBJ%g
-s%@XDR_OBJ@%$XDR_OBJ%g
-s%@AXP_ASM_OBJ@%$AXP_ASM_OBJ%g
+s%@GNU_CC_TRUE@%$GNU_CC_TRUE%g
+s%@GNU_CC_FALSE@%$GNU_CC_FALSE%g
+s%@INCLUDES@%$INCLUDES%g
+s%@GMXLIB_COND_OBJ@%$GMXLIB_COND_OBJ%g
+s%@MDLIB_COND_OBJ@%$MDLIB_COND_OBJ%g
s=XXX_SUFFIX_XXX=$SUFFIX=g
s%@SUFFIX@%$SUFFIX%g
-s%@INCLUDES@%$INCLUDES%g
CEOF
EOF
EOF
cat >> $CONFIG_STATUS <<EOF
-am_indx=1
- for am_file in src/include/config.h; do
- case " $CONFIG_HEADERS " in
- *" $am_file "*)
- echo timestamp > `echo $am_file | sed 's%:.*%%;s%[^/]*$%%'`stamp-h$am_indx
- ;;
- esac
- am_indx=\`expr \$am_indx + 1\`
- done
-AMDEP="$AMDEP"
-ac_aux_dir="$ac_aux_dir"
+
EOF
cat >> $CONFIG_STATUS <<\EOF
-test -z "$CONFIG_HEADERS" || echo timestamp > src/include/stamp-h
-
-test x"$AMDEP" != x"" ||
-for mf in $CONFIG_FILES; do
- case "$mf" in
- Makefile) dirpart=.;;
- */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
- *) continue;;
- esac
- grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
- # Extract the definition of DEP_FILES from the Makefile without
- # running `make'.
- DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
- test -z "$DEPDIR" && continue
- # When using ansi2knr, U may be empty or an underscore; expand it
- U=`sed -n -e '/^U = / s///p' < "$mf"`
- test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
- # We invoke sed twice because it is the simplest approach to
- # changing $(DEPDIR) to its actual value in the expansion.
- for file in `sed -n -e '
- /^DEP_FILES = .*\\\\$/ {
- s/^DEP_FILES = //
- :loop
- s/\\\\$//
- p
- n
- /\\\\$/ b loop
- p
- }
- /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
- sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
- # Make sure the directory exists.
- test -f "$dirpart/$file" && continue
- fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
- $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
- # echo "creating $dirpart/$file"
- echo '# dummy' > "$dirpart/$file"
- done
-done
-
+test -z "$CONFIG_HEADERS" || echo timestamp > src/include/stamp-h
exit 0
EOF
echo ""
-echo "Gromacs is ready to compile. Summary of options used:"
+echo "GROMACS is ready to compile. Summary of main options:"
echo "Architecture : $host"
if test "$enable_cpu_detection" = "yes"; then
if test -n "$gmxcpu"; then
echo "(Extra CPU detection not necessary or unavailable on this host)"
fi
fi
+echo "Vector architecture : $enable_vector"
echo "MPI parallelization : $enable_mpi"
if test "$enable_mpi" = "yes"; then
echo "Checking MPI environment : $with_mpi_environment"
echo "MPI suffix on files : $enable_mpi_suffix"
fi
-echo "Vector architecture : $enable_vector"
echo "Using Fortran code : $enable_fortran"
-echo "Double precision : $enable_double"
-if test "$enable_double" = "yes"; then
-echo "Type suffix on files : $enable_type_suffix"
+echo "Single precision : $enable_float"
+if test "$enable_float" = "no"; then
+echo "Suffix on double prec. files : $enable_type_suffix"
fi
-echo "Expanding water loops : $enable_simplewater"
-echo "Using water-water loops : $enable_waterwater_loops"
echo "Automatically nice mdrun : $enable_nice"
-echo "Using x86 SSE assembly : $enable_sse"
-echo "Using x86 3DNow assembly : $enable_3dnow"
-echo "Portable trajectories (xdr) : $enable_xdr"
+echo "Using x86 SSE/3DNow assembly : $enable_x86_asm"
echo "Software 1/x : $enable_softwarerecip"
echo "Software 1/sqrt(x) : $enable_softwaresqrt"
echo "Vectorize 1/x : $enable_vectorized_recip"
echo "Vectorize 1/sqrt(x) : $list_of_vectorized_sqrt"
echo "Prefetch coordinates in loops : $list_of_prefetch_x"
echo "Prefetch forces in loops : $list_of_prefetch_f"
-echo "Hide square latency : $enable_hide_square_latency"
-echo "Hide table lookup latency : $enable_hide_table_latency"
-echo "Using X11 : $use_x11"
+echo "X11 support : $use_x11"
echo "Motif support : $use_motif"
echo ""
echo "GROMACS will be installed under $prefix"
echo "Make sure to update your PATH and MANPATH to find the"
-echo "programs and unix manual pages."
+echo "programs and unix manual pages, and possibly LD_LIBRARY_PATH"
+echo "or /etc/ld.so.conf if you are using dynamic libraries"
#######################################################################
AC_INIT(src/gmxlib/3dview.c)
AC_PREREQ(2.13)
-AC_CONFIG_AUX_DIR(./config)
+AC_CONFIG_AUX_DIR(config)
AM_INIT_AUTOMAKE(gromacs, 3.0)
+dnl This is the version info according to the libtool versioning system.
+dnl It does *not* correspond to the release number.
+SHARED_VERSION_INFO="1:0:0"
AC_PREFIX_DEFAULT(/usr/local/gromacs)
AM_CONFIG_HEADER(src/include/config.h)
#####
AC_ARG_ENABLE(mpi,
- [ --enable-mpi Compile parallel version of Gromacs],
+ [ --enable-mpi Compile parallel version of GROMACS],
enable_mpi=$enableval, enable_mpi=no)
#####
AC_ARG_ENABLE(fortran,
- [ --enable-fortran Dortran loops (default on sgi,ibm,sun,tru64/dec)],
+ [ --enable-fortran Fortran loops (default on sgi,ibm,sun,axp)],
enable_fortran=$enableval,enable_fortran=check)
#####
-AC_ARG_ENABLE(double,
- [ --enable-double Compile double precision Gromacs],
- enable_double=$enableval, enable_double=no)
+AC_ARG_ENABLE(float,
+ [ --enable-float Compile single precision GROMACS],
+ enable_float=$enableval, enable_float=yes)
#####
[ --enable-mpi-suffix Add a suffix to MPI files (default on ibm)],
enable_mpi_suffix=$enableval, enable_mpi_suffix=check)
-
#####
AC_ARG_ENABLE(simplewater,
#####
-AC_ARG_ENABLE(sse,
- [ --disable-sse Disable SSE assembly loops on x86],
- enable_sse=$enableval, enable_sse=yes)
-
-
-#####
-
-AC_ARG_ENABLE(3dnow,
- [ --disable-3dnow Disable 3DNow assembly loops on x86],
- enable_3dnow=$enableval, enable_3dnow=yes)
+AC_ARG_ENABLE(x86_asm,
+ [ --disable-x86-asm Disable assembly loops on x86],
+ enable_x86_asm=$enableval, enable_x86_asm=yes)
#####
if test "$enable_fortran" = "check"; then
case "${host_cpu}-${host_os}" in
- sparc*-solaris* | alpha*-osf* | rs6000*-aix* | mips*-irix*)
+ sparc*-solaris* | alpha*-* | rs6000*-aix* | mips*-irix*)
enable_fortran=yes
;;
esac
if test "$enable_fortran" = "yes"; then
- # vendor f77 before g77
- AC_CHECK_PROGS(F77, f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90)
+ # vendor f77 before g77 - but special compiler list for alpha-linux
+ case "${host_cpu}-${host_os}" in
+ alpha*-linux*)
+ AC_CHECK_PROGS(F77, fort f77 g77)
+ ;;
+ *)
+ AC_CHECK_PROGS(F77, f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90)
+ ;;
+ esac
AC_PROG_F77
if test -z "$F77"; then
AC_MSG_ERROR([No fortran compiler found])
fi
fi
-# Checks for programs.
-AC_PROG_MAKE_SET
-
-AC_CHECK_PROG(CC, cc, cc) # do vendor cc before gcc
+case "${host_cpu}-${host_os}" in
+ alpha*-linux*)
+ AC_CHECK_PROGS(CC, ccc cc) # do vendor cc before gcc
+ ;;
+ *)
+ AC_CHECK_PROG(CC, cc, cc) # do vendor cc before gcc
+ ;;
+esac
AC_PROG_CC
+AC_PROG_CPP
BUILD_CC=$CC
AC_SUBST(BUILD_CC)
fi
# if we are using mpi, also get an MPICC. We cannot set that in the PROG_CC macro
-# above, since the autoconf checks that the created file can be executed. This would
+# above, since autoconf checks that the created file can be executed. This would
# fail on platforms where MPI executables can only be run through a batchqueue.
if test "$enable_mpi" = "yes"; then
AM_CONDITIONAL(USE_MPI,false)
fi
-AC_PROG_INSTALL
-AC_PROG_CPP
-
-# A rather complicated check for the nasm program and x86 assembly capabilities
-# to run under windows we must insert a test a change nasm "-f elf" to "-f win32"
+# Check that the assembler (as, invoked through $CC) accepts Intel syntax
+# and the MMX/SSE/3DNow instructions used by the assembly inner loops.
if test "$x86" = "yes"; then
- if [ test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"]; then
- if test "$enable_double" = "yes"; then
- AC_MSG_WARN([SSE/3Dnow assembly can only be used in single precision])
- enable_sse=no
- enable_3dnow=no
+ if [ test "$enable_x86_asm" = "yes"]; then
+ if test "$enable_float" = "no"; then
+ AC_MSG_WARN([The assembly loops can only be used in single precision - disabling])
+ enable_x86_asm=no
else
- AC_PATH_PROG(NASM,nasm,no)
- NASMFLAGS="-f elf"
- AC_SUBST(NASMFLAGS)
- if test "$NASM" = "no"; then
- AC_MSG_ERROR([Nasm is required for SSE and 3DNow loops.])
- fi
- if test "$enable_sse" = "yes"; then
- AC_MSG_CHECKING([whether nasm supports SSE instructions])
-cat > conftest_sse.s << EOF
- global checksse
-checksse:
+ AC_MSG_CHECKING([whether as fully supports intel syntax])
+cat > conftest.s << EOF
+.intel_syntax noprefix
+checkasm:
emms
- xorps xmm0,xmm0
- emms
- ret
-EOF
- if AC_TRY_COMMAND($NASM conftest_sse.s); then
- AC_MSG_RESULT([yes])
- else
- AC_MSG_RESULT([no])
- AC_MSG_ERROR([Download a patched nasm from the Gromacs homepage,]
- [or disable SSE assembly.])
- fi
- fi
- if test "$enable_3dnow" = "yes"; then
- AC_MSG_CHECKING([whether nasm supports extended 3DNow instructions])
-cat > conftest_3dnow.s << EOF
- global check3dnow
-check3dnow:
- femms
pswapd mm0,mm0
- femms
+ movups xmm0,[checkasm]
+ emms
ret
EOF
- if AC_TRY_COMMAND([$NASM -f elf conftest_3dnow.s]); then
- AC_MSG_RESULT([yes])
- else
- AC_MSG_RESULT([no])
- AC_MSG_ERROR([Download a patched nasm from the Gromacs homepage,]
- [or disable 3DNow assembly.])
- fi
+ if AC_TRY_COMMAND($CC -c conftest.s); then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([Upgrade to binutils>=2.11, download the as executable]
+ [from www.gromacs.org, or disable assembly loops.])
fi
- fi
+ fi
fi
else # not x86
-enable_sse=no
-enable_3dnow=no
+enable_x86_asm=no
fi
AC_PATH_PROG(IDENT,ident,no)
AC_MSG_RESULT([no]))
fi
-AC_PROG_RANLIB
+
+AC_PROG_LN_S
+AM_PROG_LIBTOOL
AC_CHECK_FUNCS(strcasecmp)
AC_CHECK_FUNCS(strdup)
fi
fi
-if test "$enable_double" = "yes"; then
- precision=8
-else
+if test "$enable_float" = "yes"; then
precision=4
+else
+ precision=8
fi
if test "$enable_mpi" = "yes"; then
######
if test "$enable_xdr" = "no"; then
- AC_MSG_WARN([* Not using XDR cripples Gromacs significantly. You won't be able to *]
- [* read or write any hardware independent or compressed trajectories. *]
- [* We strongly suggest you try to locate the RPC routines instead! *])
+ AC_MSG_WARN([* Not using XDR cripples GROMACS significantly. You won't be able to *]
+ [* read or write any compressed trajectories. You have no choice on *]
+ [* windows, but if you run UNIX locate the RPC files - you have them! *])
else
# check for xtc headers
AC_CHECK_HEADERS(rpc/rpc.h rpc/xdr.h,,AC_MSG_ERROR([RPC/XDR include headers not found]))
# Substitute things in output and header files.
########################################################################
SUFFIX=""
+GMXLIB_COND_OBJ=""
if test "$enable_mpi" = "yes"; then
- AC_DEFINE(USE_MPI,,[Make a parallel version of Gromacs using MPI])
- PAR_OBJ='${mpi_obj}'
+ AC_DEFINE(USE_MPI,,[Make a parallel version of GROMACS using MPI])
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mpiio.lo"
if test "$enable_mpi_suffix" = "yes"; then
SUFFIX="_mpi"
fi
else
- PAR_OBJ='${libnet_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libnet.lo"
fi
if test "$enable_vector" = "yes"; then
if test "$enable_fortran" = "yes"; then
AC_DEFINE(USE_FORTRAN,,[Use Fortran for innerloops and some other core stuff])
- INNER_F77_OBJ='${inner_f77_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerf.lo f77_wrappers.lo"
+ if test "$enable_float" = "yes"; then
+ MDLIB_COND_OBJ="flincs.lo fsettle.lo fshake.lo"
+ else
+ MDLIB_COND_OBJ="flincsd.lo fsettled.lo fshaked.lo"
+ fi
AM_CONDITIONAL(USE_FORTRAN,true)
else
- INNER_C_OBJ='${inner_c_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerc.lo"
+ MDLIB_COND_OBJ="clincs.lo csettle.lo cshake.lo"
AM_CONDITIONAL(USE_FORTRAN,false)
fi
-if test "$enable_double" = "yes"; then
+if test "$enable_float" = "no"; then
AC_DEFINE(DOUBLE,,[Compile in double precision])
if test "$enable_type_suffix" = "yes"; then
SUFFIX="${SUFFIX}_d"
AC_DEFINE(NO_NICE,,[Turn off the automatic nicing of gromacs])
fi
-if test "$enable_sse" = "yes"; then
- AC_DEFINE(USE_SSE,,[Use x86 assembly with SSE instructions])
- SSE_OBJ='${sse_obj}'
-fi
-
-if test "$enable_3dnow" = "yes"; then
- AC_DEFINE(USE_3DNOW,,[Use x86 assembly with 3DNow instructions])
- TDN_OBJ='${tdn_obj}'
-fi
-
-if [test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"]; then
- X86_ASM_OBJ='${x86_asm_obj}'
+if test "$enable_x86_asm" = "yes"; then
+ AC_DEFINE(USE_SSE,,[Use x86 assembly loops])
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} x86_cpuid.lo x86_sse.lo x86_3dnow.lo"
fi
if [test "$motif_includes" != "none" -a "$motif_libraries" != "none"]; then
- MOTIF_OBJ='${motif_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mgmx.lo widget.lo"
fi
if test "$enable_xdr" = "yes"; then
AC_DEFINE(USE_XDR,,[Use xdr routines to make trajectories portable])
- XDR_OBJ='${xdr_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libxdrf.lo ftocstr.lo"
else
- XDR_OBJ='${noxdr_obj}'
+ GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} dumxdrf.lo"
fi
if test "$enable_softwaresqrt" = "yes"; then
- AC_DEFINE(SOFTWARE_SQRT,,[Use the Gromacs software 1/sqrt(x)])
+ AC_DEFINE(SOFTWARE_SQRT,,[Use the GROMACS software 1/sqrt(x)])
fi
if test "$enable_softwarerecip" = "yes"; then
- AC_DEFINE(SOFTWARE_RECIP,,[Use the Gromacs software 1/x])
+ AC_DEFINE(SOFTWARE_RECIP,,[Use the GROMACS software 1/x])
fi
if test "$enable_hide_square_latency" = "yes"; then
AC_DEFINE_UNQUOTED(GMXLIBDIR,"${prefix}/top",[Default topology file location])
fi
-AC_SUBST(PAR_OBJ)
-AC_SUBST(INNER_F77_OBJ)
-AC_SUBST(INNER_C_OBJ)
-AC_SUBST(SSE_OBJ)
-AC_SUBST(TDN_OBJ)
-AC_SUBST(X86_ASM_OBJ)
-AC_SUBST(MOTIF_OBJ)
-AC_SUBST(XDR_OBJ)
-AC_SUBST(AXP_ASM_OBJ) # not used right now
-ACX_SUBST_XXX(SUFFIX)
-AC_SUBST(SUFFIX)
-
# Check if there are any optimizations and options for this arch and cpu
ACX_COMPILER_MAXOPT
AC_SUBST(INCLUDES) # should be automatic, but doesnt seem to be?
+AC_SUBST(GMXLIB_COND_OBJ)
+AC_SUBST(MDLIB_COND_OBJ)
+ACX_SUBST_XXX(SUFFIX)
+AC_SUBST(SUFFIX)
+
+
# put binaries and libraries in subdirectories named as the arch
if test -n "$gmxcpu"; then
bindir="\${exec_prefix}/bin/${host}/${gmxcpu}"
AC_SUBST(bindir)
AC_SUBST(libdir)
-
AC_OUTPUT([Makefile
src/Makefile
src/gmxlib/Makefile
man/Makefile])
echo ""
-echo "Gromacs is ready to compile. Summary of options used:"
+echo "GROMACS is ready to compile. Summary of main options:"
echo "Architecture : $host"
if test "$enable_cpu_detection" = "yes"; then
if test -n "$gmxcpu"; then
echo "(Extra CPU detection not necessary or unavailable on this host)"
fi
fi
+echo "Vector architecture : $enable_vector"
echo "MPI parallelization : $enable_mpi"
if test "$enable_mpi" = "yes"; then
echo "Checking MPI environment : $with_mpi_environment"
echo "MPI suffix on files : $enable_mpi_suffix"
fi
-echo "Vector architecture : $enable_vector"
echo "Using Fortran code : $enable_fortran"
-echo "Double precision : $enable_double"
-if test "$enable_double" = "yes"; then
-echo "Type suffix on files : $enable_type_suffix"
+echo "Single precision : $enable_float"
+if test "$enable_float" = "no"; then
+echo "Suffix on double prec. files : $enable_type_suffix"
fi
-echo "Expanding water loops : $enable_simplewater"
-echo "Using water-water loops : $enable_waterwater_loops"
echo "Automatically nice mdrun : $enable_nice"
-echo "Using x86 SSE assembly : $enable_sse"
-echo "Using x86 3DNow assembly : $enable_3dnow"
-echo "Portable trajectories (xdr) : $enable_xdr"
+echo "Using x86 SSE/3DNow assembly : $enable_x86_asm"
echo "Software 1/x : $enable_softwarerecip"
echo "Software 1/sqrt(x) : $enable_softwaresqrt"
echo "Vectorize 1/x : $enable_vectorized_recip"
echo "Vectorize 1/sqrt(x) : $list_of_vectorized_sqrt"
echo "Prefetch coordinates in loops : $list_of_prefetch_x"
echo "Prefetch forces in loops : $list_of_prefetch_f"
-echo "Hide square latency : $enable_hide_square_latency"
-echo "Hide table lookup latency : $enable_hide_table_latency"
-echo "Using X11 : $use_x11"
+echo "X11 support : $use_x11"
echo "Motif support : $use_motif"
echo ""
echo "GROMACS will be installed under $prefix"
echo "Make sure to update your PATH and MANPATH to find the"
-echo "programs and unix manual pages."
+echo "programs and unix manual pages, and possibly LD_LIBRARY_PATH"
+echo "or /etc/ld.so.conf if you are using dynamic libraries"
/* Use motif/lesstif libraries */
#undef HAVE_MOTIF
-/* Make a parallel version of Gromacs using MPI */
+/* Make a parallel version of GROMACS using MPI */
#undef USE_MPI
/* Optimize for a vector architecture */
/* Turn off the automatic nicing of gromacs */
#undef NO_NICE
-/* Use x86 assembly with SSE instructions */
+/* Use x86 assembly loops */
#undef USE_SSE
-/* Use x86 assembly with 3DNow instructions */
-#undef USE_3DNOW
-
/* Use xdr routines to make trajectories portable */
#undef USE_XDR
-/* Use the Gromacs software 1/sqrt(x) */
+/* Use the GROMACS software 1/sqrt(x) */
#undef SOFTWARE_SQRT
-/* Use the Gromacs software 1/x */
+/* Use the GROMACS software 1/x */
#undef SOFTWARE_RECIP
/* Try to get coordinates to cache before using them */
static char *CopyrightText[] = {
+ /* One array element per printed line. Every element needs its own trailing
+  * comma: adjacent string literals without one are concatenated by the
+  * compiler into a single run-together element. */
"",
"Copyright (c) 1991-2001",
- "BIOSON Research Institute, Dept. of Biophysical Chemistry",
- "University of Groningen, The Netherlands",
+ "Dept. of Biophysical Chemistry, University of Groningen, The Netherlands",
+ "For additional resources, check out http://www.gromacs.org",
+ "",
+ "This program is free software; you can redistribute it and/or",
+ "modify it under the terms of the GNU General Public License",
+ "as published by the Free Software Foundation; either version 2",
+ "of the License, or (at your option) any later version.",
""
};
--- /dev/null
+Makefile.in
--- /dev/null
+## Process this file with automake to produce Makefile.in
+#
+# Don't edit Makefile.in - it is generated automatically from this Makefile.am
+#
+
+# Section 1 manual pages handed to automake's man-page machinery.
+man_MANS = g_dih.1 g_msd.1 g_tcaf.1 nmrun.1 \
+ do_dssp.1 g_dipoles.1 g_nmeig.1 g_traj.1 \
+ pdb2gmx.1 editconf.1 g_disre.1 g_nmens.1 \
+ g_velacc.1 protonate.1 eneconv.1 g_dist.1 \
+ g_order.1 genbox.1 tpbconv.1 g_anaeig.1 \
+ g_dyndom.1 g_potential.1 genconf.1 trjcat.1 \
+ g_analyze.1 g_enemat.1 g_rama.1 genion.1 \
+ trjconv.1 g_angle.1 g_energy.1 g_rdf.1 \
+ genpr.1 trjorder.1 g_bond.1 g_gyrate.1 \
+ g_rms.1 gmxcheck.1 wheel.1 g_bundle.1 \
+ g_h2order.1 g_rmsdist.1 gmxdump.1 x2top.1 \
+ g_chi.1 g_hbond.1 g_rmsf.1 grompp.1 \
+ xpm2ps.1 g_cluster.1 g_helix.1 g_rotacf.1 \
+ highway.1 xrama.1 g_confrms.1 g_lie.1 \
+ g_saltbr.1 make_ndx.1 g_covar.1 g_mdmat.1 \
+ g_sas.1 mdrun.1 g_density.1 g_mindist.1 \
+ g_sgangle.1 mk_angndx.1 g_morph.1 g_sorient.1 \
+ ngmx.1 g_dielectric.1
+
+# Ship the same pages in the "make dist" tarball.
+EXTRA_DIST = ${man_MANS}
<HTML>
+<HEAD>
<TITLE>GROMACS 3.0 Online Reference </TITLE>
+</HEAD>
<LINK rel=stylesheet href="online/style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>GROMACS 3.0 Online Reference</H2>
-<HR>
-<P>
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+
+<table WIDTH="800" NOSAVE NOBORDER >
+<tr NOSAVE>
+<td WIDTH="120" HEIGHT="140" NOSAVE><a href="http://www.gromacs.org/"><img SRC="gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td>
+
+<td ALIGN=LEFT VALIGN=TOP WIDTH=480 NOSAVE>
+<br><br>
+<h2>
+GROMACS 3.0<br>
+Online Reference</h2>
+</td>
+<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH=200 NOSAVE>
+<B>VERSION 3.0<br>
+Tue 15 May 2001</B></td>
+</tr>
+</table>
+
+<hr>
+
<TABLE BORDER=0 CELLSPACING=0 CELLPADDING=10>
<TR>
<TD VALIGN=top WIDTH="25%">
<br><a href=online/trjorder.html>trjorder</a>
<br><a href=online/wheel.html>wheel</a>
<br><a href=online/x2top.html>x2top</a>
+<br><a href=online/xmdrun.html>xmdrun</a>
<br><a href=online/xpm2ps.html>xpm2ps</a>
<br><a href=online/xrama.html>xrama</a>
</multicol>
<TR><TD><A HREF="online/grompp.html">grompp</A><TD>makes a run input file
<TR><TD><A HREF="online/tpbconv.html">tpbconv</A><TD>makes a run input file for restarting a crashed run
<TR><TD><A HREF="online/mdrun.html">mdrun</A><TD>performs a simulation
+<TR><TD><A HREF="online/xmdrun.html">xmdrun</A><TD>performs simulations with extra experimental features
</TABLE>
<A NAME="HNR3">
-<TITLE>do_dssp</TITLE>
+<HTML>
+<HEAD>
+<TITLE>do_dssp</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>do_dssp</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>do_dssp</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
do_dssp
reads a trajectory file and computes the secondary structure for
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-sss</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>HEBT</tt> </TD><TD> Secondary structures for structure count </TD></TD>
</TABLE>
-<TITLE>editconf</TITLE>
+<HTML>
+<HEAD>
+<TITLE>editconf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>editconf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>editconf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
editconf converts generic structure format to <tt>.<a href="gro.html">gro</a></tt>, <tt>.<a href="g96.html">g96</a></tt>
or <tt>.<a href="pdb.html">pdb</a></tt>.
and the radius in the occupancy.
<p>
Finally with option <tt>-label</tt> editconf can add a chain identifier
-to a <a href="pdb.html">pdb</a> file, which can be useful for analysis with e.g. rasmol.
+to a <a href="pdb.html">pdb</a> file, which can be useful for analysis with e.g. rasmol.<p>
+To convert a truncated octahedron file produced by a package which uses
+a cubic box with the corners cut off (such as Gromos) use:<br>
+<tt>editconf -f &lt;infile&gt; -rotate 0 -45 -35.2644 -bt o -box &lt;veclen&gt; -o &lt;outfile&gt;</tt><br>
+where <tt>veclen</tt> is the size of the cubic box times sqrt(3)/2.
<P>
<H3>Files</H3>
<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TITLE>eneconv</TITLE>
+<HTML>
+<HEAD>
+<TITLE>eneconv</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>eneconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>eneconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
When <tt>-f</tt> is <i>not</i> specified:<br>
Concatenates several energy files in sorted order.
-<TITLE>g_anaeig</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_anaeig</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_anaeig</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_anaeig</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
<tt>g_anaeig</tt> analyzes eigenvectors. The eigenvectors can be of a
covariance matrix (<tt><a href="g_covar.html">g_covar</a></tt>) or of a Normal Modes analysis
-<TITLE>g_analyze</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_analyze</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_analyze</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_analyze</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_analyze reads an ascii file and analyzes data sets.
A line in the input file may start with a time
-<TITLE>g_angle</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_angle</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_angle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_angle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_angle computes the angle distribution for a number of angles
or dihedrals. This way you can check whether your simulation
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-type</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>angle</tt> </TD><TD> Type of angle to analyse: angle, dihedral, improper or ryckaert-bellemans </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]all</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Plot all angles separately in the averages file, in the order of appearance in the index file. </TD></TD>
-<TITLE>g_bond</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_bond</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_bond</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_bond</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_bond makes a distribution of bond lengths. If all is well a
gaussian distribution should be made when using a harmonic potential.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-blen</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Bond length. By default length of first bond </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-tol</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.1</tt> </TD><TD> Half width of distribution as fraction of blen </TD></TD>
-<TITLE>g_bundle</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_bundle</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_bundle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_bundle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_bundle analyzes bundles of axes. The axes can be for instance
helix axes. The program reads two index groups and divides both
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-na</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Number of axes </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]z</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use the Z-axis as reference instead of the average axis </TD></TD>
</TABLE>
-<TITLE>g_chi</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_chi</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_chi</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_chi</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_chi computes phi, psi, omega and chi dihedrals for all your
amino acid backbone and sidechains.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-r0</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> starting residue </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]phi</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Output for Phi dihedral angles </TD></TD>
-<TITLE>g_cluster</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_cluster</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_cluster</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_cluster</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_cluster can cluster structures with several different methods.
Distances between structures can be determined from a trajectory
the smallest average distance to the others or the average structure
or all structures for each cluster will be written to a trajectory
file. When writing all structures, separate numbered files are made
-for each cluster.
+for each cluster.<p>Two output files are always written:<br>
+<tt>-o</tt> writes the RMSD values in the upper left half of the matrix
+and a graphical depiction of the clusters in the lower right half
+(depends on <tt>-max</tt> and <tt>-keepfree</tt>).<br>
+<tt>-g</tt> writes information on the options used and a detailed list
+of all clusters and their members.<p>
+Additionally, a number of optional output files can be written:<br>
+<tt>-dist</tt> writes the RMSD distribution.<br>
+<tt>-ev</tt> writes the eigenvectors of the RMSD matrix
+diagonalization.<br>
+<tt>-sz</tt> writes the cluster sizes.<br>
+<tt>-tr</tt> writes a matrix of the number of transitions between
+cluster pairs.<br>
+<tt>-ntr</tt> writes the total number of transitions to or from
+each cluster.<br>
+<tt>-clid</tt> writes the cluster number as a function of time.<br>
+<tt>-cl</tt> writes average (with option <tt>-av</tt>) or central
+structure of each cluster or writes numbered files with cluster members
+for a selected set of clusters (with option <tt>-wcl</tt>, depends on
+<tt>-nst</tt> and <tt>-rmsmin</tt>).<br>
<P>
<H3>Files</H3>
<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
-<TITLE>g_confrms</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_confrms</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_confrms</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_confrms</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_confrms computes the root mean square deviation (RMSD) of two
structures after LSQ fitting the second structure on the first one.
-<TITLE>g_covar</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_covar</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_covar</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_covar</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
<tt>g_covar</tt> calculates and diagonalizes the (mass-weighted)
covariance matrix.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]fit</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> yes</tt> </TD><TD> Fit to a reference structure </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]ref</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use the deviation from the conformation in the structure file instead of from the average </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]mwa</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Mass-weighted covariance analysis </TD></TD>
-<TITLE>g_density</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_density</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_density</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_density</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Compute partial densities across the box, using an index file. Densities
in gram/cubic centimeter, number densities or electron densities can be
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>10</tt> </TD><TD> Divide the box in #nr slices. </TD></TD>
-<TITLE>g_dielectric</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dielectric</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dielectric</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dielectric</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
dielectric calculates frequency dependent dielectric constants
from the autocorrelation function of the total dipole moment in
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]fft</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> use fast fourier transform for correlation function </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]x1</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> yes</tt> </TD><TD> use first column as X axis rather than first data set </TD></TD>
-<TITLE>g_dih</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dih</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dih</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_dih can do two things. The default is to analyze dihedral transitions
by merely computing all the dihedral angles defined in your topology
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> multiplicity for dihedral angles (by default read from topology) </TD></TD>
-<TITLE>g_dipoles</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dipoles</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dipoles</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dipoles</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_dipoles computes the total dipole plus fluctuations of a simulation
system. From this you can compute e.g. the dielectric constant for
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-mu</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> dipole of a single molecule (in Debye) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-mumax</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>      5</tt> </TD><TD> max dipole in Debye (for histogram) </TD></TD>
-<TITLE>g_disre</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_disre</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_disre</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_disre</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_disre computes violations of distance restraints. If necessary
all protons can be added to a protein molecule. The program always
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-ntop</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>6</tt> </TD><TD> Number of large violations that are stored in the log file every step </TD></TD>
</TABLE>
-<TITLE>g_dist</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dist</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_dist can calculate the distance between the centers of mass of two
groups of atoms as a function of time. The total distance and its
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-dist</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0</tt> </TD><TD> Print all atoms in group 2 closer than dist to the center of mass of group 1 </TD></TD>
</TABLE>
<P>
-<TITLE>g_dyndom</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dyndom</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dyndom</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dyndom</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_dyndom reads a <a href="pdb.html">pdb</a> file output from DynDom
http://md.chem.rug.nl/~steve/DynDom/dyndom.home.html
-<TITLE>g_enemat</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_enemat</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_enemat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_enemat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_enemat extracts an energy matrix from an energy file.
With <b>-groups</b> a file must be supplied with on each
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]sum</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Sum the energy terms selected rather than display them all </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-skip</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Skip number of frames between data points </TD></TD>
-<TITLE>g_energy</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_energy</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_energy</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_energy</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_energy extracts energy components or distance restraint
data from an energy file. The user is prompted to interactively
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]fee</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Do a free energy estimate </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-fetemp</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 300</tt> </TD><TD> Reference temperature for free energy calculation </TD></TD>
-<TITLE>g_gyrate</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_gyrate</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_gyrate</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_gyrate</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_gyrate computes the radius of gyration of a group of atoms
and the radii of gyration about the x, y and z axes, as a function of time. The atoms are explicitly mass weighted.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]q</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use absolute value of the charge of an atom as weighting factor instead of mass </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]p</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Calculate the radii of gyration about the principal axes. </TD></TD>
-<TITLE>g_h2order</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_h2order</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_h2order</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_h2order</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Compute the orientation of water molecules with respect to the normal
of the box. The program determines the average cosine of the angle
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Calculate order parameter as function of boxlength, dividing the box in #nr slices. </TD></TD>
-<TITLE>g_hbond</TITLE>
+<HTML><HEAD><TITLE>g_hbond</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_hbond</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_hbond</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_hbond computes and analyzes hydrogen bonds. Hydrogen bonds are
determined based on cutoffs for the angle Donor - Hydrogen - Acceptor
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]ins</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Analyze solvent insertion </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-a</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 60</tt> </TD><TD> Cutoff angle (degrees, Donor - Hydrogen - Acceptor) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-r</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.25</tt> </TD><TD> Cutoff radius (nm, Hydrogen - Acceptor) </TD></TD>
-<TITLE>g_helix</TITLE>
+<HTML><HEAD><TITLE>g_helix</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_helix</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_helix</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_helix computes all kinds of helix properties. First, the peptide
is checked to find the longest helical part. This is determined by
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-r0</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> The first residue number in the sequence </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]q</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Check at every step which part of the sequence is helical </TD></TD>
-<TITLE>g_lie</TITLE>
+<HTML><HEAD><TITLE>g_lie</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_lie</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_lie</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_lie computes a free energy estimate based on an energy analysis.
One needs an energy file with the following components:
-<TITLE>g_mdmat</TITLE>
+<HTML><HEAD><TITLE>g_mdmat</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_mdmat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_mdmat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_mdmat makes distance matrices consisting of the smallest distance
between residue pairs. With -frames these distance matrices can be
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-t</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 1.5</tt> </TD><TD> trunc distance </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nlevels</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>40</tt> </TD><TD> Discretize distance in # levels </TD></TD>
</TABLE>
-<TITLE>g_mindist</TITLE>
+<HTML><HEAD><TITLE>g_mindist</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_mindist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_mindist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_mindist computes the distance between one group and a number of
other groups.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]matrix</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Calculate half a matrix of group-group distances </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.6</tt> </TD><TD> Distance for contacts </TD></TD>
-<TITLE>g_morph</TITLE>
+<HTML><HEAD><TITLE>g_morph</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_morph</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_morph</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_morph does a linear interpolation of conformations in order to
create intermediates. Of course these are completely unphysical, but
-<TITLE>g_msd</TITLE>
+<HTML><HEAD><TITLE>g_msd</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_msd</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_msd</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_msd computes the mean square displacement (MSD) of atoms from
their initial positions. This provides an easy way to compute
-<TITLE>g_nmeig</TITLE>
+<HTML><HEAD><TITLE>g_nmeig</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_nmeig</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_nmeig</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_nmeig calculates the eigenvectors/values of a (Hessian) matrix,
which can be calculated with <tt><a href="nmrun.html">nmrun</a></tt>.
-<TITLE>g_nmens</TITLE>
+<HTML><HEAD><TITLE>g_nmens</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_nmens</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_nmens</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
<tt>g_nmens</tt> generates an ensemble around an average structure
in a subspace which is defined by a set of normal modes (eigenvectors).
-<TITLE>g_order</TITLE>
+<HTML><HEAD><TITLE>g_order</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_order</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_order</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Compute the order parameter per atom for carbon tails. For atom i the
vector i-1, i+1 is used together with an axis. The index file has to contain
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>z</tt> </TD><TD> Direction of the normal on the membrane: z, x or y </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> Calculate order parameter as function of boxlength, dividing the box in #nr slices. </TD></TD>
-<TITLE>g_potential</TITLE>
+<HTML><HEAD><TITLE>g_potential</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_potential</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_potential</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Compute the electrostatic potential across the box. The potential is calculated by first summing the charges per slice and then integrating this charge distribution twice. Periodic boundaries are not taken into account. The reference of the potential is taken to be the left side of the box. It is also possible to calculate the potential in spherical coordinates as a function of r by calculating a charge distribution in spherical slices and integrating it twice. epsilon_r is taken as 1, but 2 is more appropriate in many cases.
<P>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>10</tt> </TD><TD> Calculate potential as function of boxlength, dividing the box in #nr slices. </TD></TD>
-<TITLE>g_rama</TITLE>
+<HTML><HEAD><TITLE>g_rama</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rama</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rama</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_rama selects the Phi/Psi dihedral combinations from your topology file
and computes these as a function of time.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
</TABLE>
<P>
-<TITLE>g_rdf</TITLE>
+<HTML><HEAD><TITLE>g_rdf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rdf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rdf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
The structure of liquids can be studied by either neutron or X-ray
scattering. The most common way to describe liquid structure is by a
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-bin</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.001</tt> </TD><TD> Binwidth (nm) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> RDF with respect to the center of mass of first group </TD></TD>
-<TITLE>g_rms</TITLE>
+<HTML><HEAD><TITLE>g_rms</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rms</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rms</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_rms compares two structures by computing the root mean square
deviation (RMSD), the size-independent 'rho' similarity parameter
-<TITLE>g_rmsdist</TITLE>
+<HTML><HEAD><TITLE>g_rmsdist</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rmsdist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rmsdist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_rmsdist computes the root mean square deviation of atom distances,
which has the advantage that no fit is needed like in standard RMS
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nlevels</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>40</tt> </TD><TD> Discretize rms in # levels </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-max</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Maximum level in matrices </TD></TD>
-<TITLE>g_rmsf</TITLE>
+<HTML><HEAD><TITLE>g_rmsf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rmsf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rmsf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_rmsf computes the root mean square fluctuation (RMSF, i.e. standard
deviation) of atomic positions
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]res</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Calculate averages for each residue </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]aniso</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Compute anisotropic temperature factors </TD></TD>
-<TITLE>g_rotacf</TITLE>
+<HTML><HEAD><TITLE>g_rotacf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rotacf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rotacf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_rotacf calculates the rotational correlation function
for molecules. Three atoms (i,j,k) must be given in the index
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]d</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use index doublets (vectors) for correlation function instead of triplets (planes) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]aver</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> yes</tt> </TD><TD> Average over molecules </TD></TD>
-<TITLE>g_saltbr</TITLE>
+<HTML><HEAD><TITLE>g_saltbr</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_saltbr</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_saltbr</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_saltbr plots the difference between all combinations of charged groups
as a function of time. The groups are combined in different ways. A minimum distance can be given (e.g. the cut-off), then groups
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-t</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 1000</tt> </TD><TD> trunc distance </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]sep</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use separate files for each interaction (may be MANY) </TD></TD>
</TABLE>
-<TITLE>g_sas</TITLE>
+<HTML><HEAD><TITLE>g_sas</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sas</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sas</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_sas computes hydrophobic and total solvent accessible surface area.
As a side effect the Connolly surface can be generated as well in
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-solsize</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.14</tt> </TD><TD> Radius of the solvent probe (nm) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-ndots</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>24</tt> </TD><TD> Number of dots per sphere, more dots means more accuracy </TD></TD>
-<TITLE>g_sgangle</TITLE>
+<HTML><HEAD><TITLE>g_sgangle</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sgangle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sgangle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Compute the angle and distance between two groups.
The groups are defined by a number of atoms given in an index file and
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
</TABLE>
<P>
-<TITLE>g_sorient</TITLE>
+<HTML><HEAD><TITLE>g_sorient</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sorient</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sorient</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_sorient analyzes solvent orientation around solutes.
It calculates two angles between the vector from one or more
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use the center of mass as the reference position </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-rmin</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0</tt> </TD><TD> Minimum distance </TD></TD>
-<TITLE>g_tcaf</TITLE>
+<HTML><HEAD><TITLE>g_tcaf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_tcaf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_tcaf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_tcaf computes transverse current autocorrelations.
These are used to estimate the shear viscosity eta.
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Calculate tcaf of molecules </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]k34</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Also use k=(3,0,0) and k=(4,0,0) </TD></TD>
-<TITLE>g_traj</TITLE>
+<HTML><HEAD><TITLE>g_traj</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_traj</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_traj</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_traj plots coordinates, velocities, forces and/or the box.
With <tt>-com</tt> the coordinates, velocities and forces are
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Plot data for the com of each group </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Index contains molecule numbers iso atom numbers </TD></TD>
-<TITLE>g_velacc</TITLE>
+<HTML>
+<HEAD>
+<TITLE>g_velacc</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_velacc</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_velacc</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
g_velacc computes the velocity autocorrelation function.
When the <tt>-s</tt> option is used, the momentum autocorrelation
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Calculate vac of molecules </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-acflen</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> Length of the ACF, default is half the number of frames </TD></TD>
-<TITLE>genbox</TITLE>
+<HTML>
+<HEAD>
+<TITLE>genbox</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genbox</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genbox</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Genbox can do one of 3 things:<p>
1) Generate a box of solvent. Specify -cs and -box. Or specify -cs and
-<TITLE>genconf</TITLE>
+<HTML>
+<HEAD>
+<TITLE>genconf</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genconf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genconf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
genconf multiplies a given coordinate file by simply stacking them
on <a href="top.html">top</a> of each other, like a small child playing with wooden blocks.
-<TITLE>genion</TITLE>
+<HTML>
+<HEAD>
+<TITLE>genion</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genion</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genion</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
genion replaces solvent molecules by monoatomic ions at
the position of the first atoms with the most favorable electrostatic
-<TITLE>genpr</TITLE>
+<HTML>
+<HEAD>
+<TITLE>genpr</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genpr</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genpr</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
genpr produces an include file for a topology containing
a list of atom numbers and three force constants for the
-<TITLE>gmxcheck</TITLE>
+<HTML>
+<HEAD>
+<TITLE>gmxcheck</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>gmxcheck</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>gmxcheck</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
gmxcheck reads a trajectory (<tt>.<a href="trj.html">trj</a></tt>, <tt>.<a href="trr.html">trr</a></tt> or
<tt>.<a href="xtc.html">xtc</a></tt>) or an energy file (<tt>.<a href="ene.html">ene</a></tt> or <tt>.<a href="edr.html">edr</a></tt>)
-<TITLE>gmxdump</TITLE>
+<HTML>
+<HEAD>
+<TITLE>gmxdump</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>gmxdump</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>gmxdump</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
gmxdump reads a run input file (<tt>.<a href="tpa.html">tpa</a></tt>/<tt>.<a href="tpr.html">tpr</a></tt>/<tt>.<a href="tpb.html">tpb</a></tt>),
a trajectory (<tt>.<a href="trj.html">trj</a></tt>/<tt>.<a href="trr.html">trr</a></tt>/<tt>.<a href="xtc.html">xtc</a></tt>) or an energy
-<TITLE>grompp</TITLE>
+<HTML>
+<HEAD>
+<TITLE>grompp</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>grompp</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>grompp</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
The gromacs preprocessor
reads a molecular topology file, checks the validity of the
-<TITLE>highway</TITLE>
+<HTML>
+<HEAD>
+<TITLE>highway</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>highway</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>highway</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
highway is the gromacs highway simulator. It is an X-windows
gadget that shows a (periodic) autobahn with a user defined
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
</TABLE>
<P>
<hr>
-<TITLE>make_ndx</TITLE>
+<HTML>
+<HEAD>
+<TITLE>make_ndx</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>make_ndx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>make_ndx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
Index groups are necessary for almost every gromacs program.
All these programs can generate default index groups. You ONLY
-<TITLE>mdrun</TITLE>
+<HTML>
+<HEAD>
+<TITLE>mdrun</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>mdrun</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>mdrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
The mdrun program performs Molecular Dynamics simulations.
It reads the run input file (<tt>-s</tt>) and distributes the
-<TITLE>mk_angndx</TITLE>
+<HTML>
+<HEAD>
+<TITLE>mk_angndx</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>mk_angndx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>mk_angndx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
mk_angndx makes an index file for calculation of
angle distributions etc. It uses a run input file (<tt>.tpx</tt>) for the
-<TITLE>ngmx</TITLE>
+<HTML>
+<HEAD>
+<TITLE>ngmx</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>ngmx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>ngmx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
ngmx is the Gromacs trajectory viewer. This program reads a
trajectory file, a run input file and an index file and plots a
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
</TABLE>
<P>
<H3>Diagnostics</H3>
-<TITLE>nmrun</TITLE>
+<HTML>
+<HEAD>
+<TITLE>nmrun</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>nmrun</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>nmrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
nmrun builds a Hessian matrix from single conformation.
For usual Normal Modes-like calculations, make sure that
-<TITLE>pdb2gmx</TITLE>
+<HTML>
+<HEAD>
+<TITLE>pdb2gmx</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>pdb2gmx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>pdb2gmx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
This program reads a <a href="pdb.html">pdb</a> file, lets you choose a forcefield, reads
some database files, adds hydrogens to the molecules and generates
-<TITLE>protonate</TITLE>
+<HTML>
+<HEAD>
+<TITLE>protonate</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>protonate</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>protonate</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
<tt>protonate</tt> reads (a) conformation(s) and adds all missing
hydrogens as defined in <tt>ffgmx2.<a href="hdb.html">hdb</a></tt>. If only <tt>-s</tt> is
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
</TABLE>
<P>
<hr>
-P { text-indent: 0em; font-family: helvetica,verdana,arial,sans-serif }
+th { font-family: arial,helvetica,verdana,sans-serif }
+P { text-indent: 0em; font-family: arial,helvetica,verdana,sans-serif }
H1 { text-indent: 0em; font-size: 24pt; font-family: serif }
-H2 { text-indent: 0em; font-size: 24pt; font-weight: bold; font-family: helvetica,verdana,arial,sans-serif }
-H3 { text-indent: 0em; font-size: 18pt; font-weight: bold; font-family: helvetica,verdana,arial,sans-serif }
-A:link { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif }
-A:active { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif }
-A:visited { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif }
-body { text-indent: 0em; font-family: helvetica,verdana,arial,sans-serif }
-td { font-family: helvetica,verdana,arial,sans-serif }
-th { font-family: helvetica,verdana,arial,sans-serif }
+H2 { text-indent: 0em; font-size: 24pt; font-weight: bold; font-family: arial,helvetica,verdana,sans-serif }
+H3 { text-indent: 0em; font-size: 18pt; font-weight: bold; font-family: arial,helvetica,verdana,sans-serif }
+A:link { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif }
+A:active { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif }
+A:visited { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif }
+body { text-indent: 0em; font-family: arial,helvetica,verdana,sans-serif }
+td { font-family: arial,helvetica,verdana,sans-serif }
+th { font-family: arial,helvetica,verdana,sans-serif }
+li { font-family: arial,helvetica,verdana,sans-serif }
+ul { font-family: arial,helvetica,verdana,sans-serif }
+tt { font-family: courier,"lucida console",serif }
+
+
+
+
+
+
+
-<TITLE>tpbconv</TITLE>
+<HTML>
+<HEAD>
+<TITLE>tpbconv</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>tpbconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>tpbconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
tpbconv can edit run input files in two ways.<p><b>1st.</b> by creating a run input file
for a continuation run when your simulation has crashed due to e.g.
-<TITLE>trjcat</TITLE>
+<HTML>
+<HEAD>
+<TITLE>trjcat</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjcat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjcat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
trjcat concatenates several input trajectory files in sorted order.
In case of double time frames the one in the later file is used.
-<TITLE>trjconv</TITLE>
+<HTML><HEAD><TITLE>trjconv</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
trjconv can convert trajectory files in many ways:<br>
<b>1.</b> from one format to another<br>
<TR><TD ALIGN=RIGHT> <b><tt>-trunc</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Truncate input <a href="trj.html">trj</a> file after this time (ps) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-exec</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt></tt> </TD><TD> Execute command for every output frame with the frame number as argument </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]app</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Append output </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-split</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> 0</tt> </TD><TD> Start writing new file when t MOD split = first time (ps) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]sep</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Write each frame to a separate .<a href="gro.html">gro</a> or .<a href="pdb.html">pdb</a> file </TD></TD>
</TABLE>
<P>
-<TITLE>trjorder</TITLE>
+<HTML><HEAD><TITLE>trjorder</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjorder</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjorder</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
trjorder orders molecules according to the smallest distance
to atoms in a reference group. It will ask for a group of reference
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-na</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>3</tt> </TD><TD> Number of atoms in a molecule </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-da</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> Atom used for the distance calculation </TD></TD>
</TABLE>
-<TITLE>wheel</TITLE>
+<HTML><HEAD><TITLE>wheel</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>wheel</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>wheel</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
wheel plots a helical wheel representation of your sequence.The input sequence is in the .<a href="dat.html">dat</a> file where the first line contains
the number of residues and each consecutive line contains a residuename.
-<TITLE>x2top</TITLE>
+<HTML><HEAD><TITLE>x2top</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>x2top</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>x2top</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
x2top generates a primitive topology from a coordinate file.
The program assumes all hydrogens are present when defining
--- /dev/null
+<HTML><HEAD><TITLE>xmdrun</TITLE>
+<LINK rel=stylesheet href="style.css" type="text/css">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
+<TR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xmdrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
+<H3>Description</H3>
+xmdrun is the experimental MD program. New features are tested in this
+program before being implemented in the default <a href="mdrun.html">mdrun</a>. Currently under
+investigation are: polarizability, glass simulations,
+Free energy perturbation, X-Ray bombardments
+and parallel independent simulations. It reads the run input file (<tt>-s</tt>) and distributes the
+topology over nodes if needed. The coordinates are passed
+around, so that computations can begin.
+First a neighborlist is made, then the forces are computed.
+The forces are globally summed, and the velocities and
+positions are updated. If necessary shake is performed to constrain
+bond lengths and/or bond angles.
+Temperature and Pressure can be controlled using weak coupling to a
+bath.<p>
+<a href="mdrun.html">mdrun</a> produces at least three output files, plus one log file
+(<tt>-g</tt>) per node.
+The trajectory file (<tt>-o</tt>), contains coordinates, velocities and
+optionally forces.
+The structure file (<tt>-c</tt>) contains the coordinates and
+velocities of the last step.
+The energy file (<tt>-e</tt>) contains energies, the temperature,
+pressure, etc.; a lot of these things are also printed in the log file
+of node 0.
+Optionally coordinates can be written to a compressed trajectory file
+(<tt>-x</tt>).<p>
+When running in parallel with PVM or an old version of MPI the
+<tt>-np</tt> option must be given to indicate the number of
+nodes.<p>
+The option <tt>-dgdl</tt> is only used when free energy perturbation is
+turned on.<p>
+With <tt>-rerun</tt> an input trajectory can be given for which
+forces and energies will be (re)calculated. Neighbor searching will be
+performed for every frame, unless <tt>nstlist</tt> is zero
+(see the <tt>.<a href="mdp.html">mdp</a></tt> file).<p>
+ED (essential dynamics) sampling is switched on by using the <tt>-ei</tt>
+flag followed by an <tt>.<a href="edi.html">edi</a></tt> file.
+The <tt>.<a href="edi.html">edi</a></tt> file can be produced using options in the essdyn
+menu of the WHAT IF program. <a href="mdrun.html">mdrun</a> produces a <tt>.<a href="edo.html">edo</a></tt> file that
+contains projections of positions, velocities and forces onto selected
+eigenvectors.<p>
+The -table option can be used to pass <a href="mdrun.html">mdrun</a> a formatted table with
+user-defined potential functions. The file is read from either the
+current directory or from the GMXLIB directory. A number of preformatted
+tables are presented in the GMXLIB dir, for 6-8, 6-9, 6-10, 6-11, 6-12
+Lennard Jones potentials with normal Coulomb.<p>
+The options <tt>-pi</tt>, <tt>-po</tt>, <tt>-pd</tt>, <tt>-pn</tt> are used
+for potential of mean force calculations and umbrella sampling.
+See manual.<p>
+When <a href="mdrun.html">mdrun</a> receives a TERM signal, it will set nsteps to the current
+step plus one. When <a href="mdrun.html">mdrun</a> receives a USR1 signal, it will set nsteps
+to the next multiple of nstxout after the current step.
+In both cases all the usual output will be written to file.
+When running with MPI, a signal to one of the <a href="mdrun.html">mdrun</a> processes
+is sufficient; this signal should not be sent to mpirun or
+the <a href="mdrun.html">mdrun</a> process that is the parent of the others.
+<P>
+<H3>Files</H3>
+<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
+<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> topol.tpr</a></tt> </TD><TD> Input </TD><TD> Generic run input: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> traj.trr</a></tt> </TD><TD> Output </TD><TD> Full precision trajectory: <a href="trr.html">trr</a> <a href="trj.html">trj</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-x</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xtc.html"> traj.xtc</a></tt> </TD><TD> Output, Opt. </TD><TD> Compressed trajectory (portable xdr format) </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-c</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> confout.gro</a></tt> </TD><TD> Output </TD><TD> Generic structure: <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> ener.edr</a></tt> </TD><TD> Output </TD><TD> Generic energy: <a href="edr.html">edr</a> <a href="ene.html">ene</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-g</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="log.html"> md.log</a></tt> </TD><TD> Output </TD><TD> Log file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-dgdl</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html"> dgdl.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-table</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html"> table.xvg</a></tt> </TD><TD> Input, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-rerun</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> rerun.xtc</a></tt> </TD><TD> Input, Opt. </TD><TD> Generic trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-ei</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="edi.html"> sam.edi</a></tt> </TD><TD> Input, Opt. </TD><TD> ED sampling input </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-eo</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="edo.html"> sam.edo</a></tt> </TD><TD> Output, Opt. </TD><TD> ED sampling output </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-j</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="gct.html"> wham.gct</a></tt> </TD><TD> Input, Opt. </TD><TD> General coupling stuff </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-jo</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="gct.html"> bam.gct</a></tt> </TD><TD> Input, Opt. </TD><TD> General coupling stuff </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-ffout</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html"> gct.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-devout</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html">deviatie.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-runav</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html"> runaver.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pi</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ppa.html"> pull.ppa</a></tt> </TD><TD> Input, Opt. </TD><TD> Pull parameters </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-po</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ppa.html"> pullout.ppa</a></tt> </TD><TD> Output, Opt. </TD><TD> Pull parameters </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pd</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="pdo.html"> pull.pdo</a></tt> </TD><TD> Output, Opt. </TD><TD> Pull data output </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pn</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ndx.html"> pull.ndx</a></tt> </TD><TD> Input, Opt. </TD><TD> Index file </TD></TR>
+</TABLE>
+<P>
+<H3>Other options</H3>
+<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
+<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-deffnm</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt></tt> </TD><TD> Set the default filename for all file options </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]v</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Be loud and noisy </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]compact</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> yes</tt> </TD><TD> Write a compact log file </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]multi</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Do multiple simulations in parallel (only with -np > 1) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]glas</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Do glass simulation with special long range corrections </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]ionize</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Do a simulation including the effect of an X-Ray bombardment on your system </TD></TD>
+</TABLE>
+<P>
+<hr>
+<div ALIGN=RIGHT>
+<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
+<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
+</div>
+</BODY>
-<TITLE>xpm2ps</TITLE>
+<HTML><HEAD><TITLE>xpm2ps</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>xpm2ps</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xpm2ps</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
xpm2ps makes a beautiful color plot of an XPixelMap file.
Labels and axis can be displayed, when they are supplied
half of the second one (<tt>-f2</tt>). The diagonal will contain
values from the matrix file selected with <tt>-diag</tt>.
Plotting of the diagonal values can be suppressed altogether by
-setting <tt>-diag</tt> to <tt>none</tt>.<p>
+setting <tt>-diag</tt> to <tt>none</tt>. With
+<tt>-combine</tt> an alternative operation can be selected to combine
+the matrices. In this case, a new color map will be generated with
+a red gradient for negative numbers and a blue for positive.<p>
If the color coding and legend labels of both matrices are identical,
only one legend will be displayed, else two separate legends are
displayed.<p>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]frame</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> yes</tt> </TD><TD> Display frame, ticks, labels, title and legend </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-title</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>top</tt> </TD><TD> Show title at: <a href="top.html">top</a>, ylabel or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-title</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>top</tt> </TD><TD> Show title at: <a href="top.html">top</a>, once, ylabel or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]yonce</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Show y-label only once </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-legend</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>both</tt> </TD><TD> Show legend: both, first, second or none </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-diag</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>first</tt> </TD><TD> Diagonal: first, second or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-combine</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>halves</tt> </TD><TD> Combine two matrices: halves, add, sub, mult or div </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-bx</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0</tt> </TD><TD> Box x-size (also y-size when -by is not set) </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-by</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0</tt> </TD><TD> Box y-size </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-rainbow</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>no</tt> </TD><TD> Rainbow colors, convert white to: no, blue or red </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-gradient</tt></b> </TD><TD ALIGN=RIGHT> vector </TD><TD ALIGN=RIGHT> <tt>0 0 0</tt> </TD><TD> Re-scale colormap to a smooth gradient from white {1,1,1} to {r,g,b} </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-skip</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> only write out every nr-th row and column </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]zeroline</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> insert line in <a href="xpm.html">xpm</a> matrix where axis label is zero </TD></TD>
</TABLE>
-<TITLE>xrama</TITLE>
+<HTML><HEAD><TITLE>xrama</TITLE>
<LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>xrama</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
<TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xrama</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
<H3>Description</H3>
xrama shows a Ramachandran movie, that is, it shows
the Phi/Psi angles as a function of time in an X-Window.<p>Static Phi/Psi plots for printing can be made with <a href="g_rama.html">g_rama</a>.<p>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Print help info and quit </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt> no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt> -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
</TABLE>
<P>
<hr>
# Don't edit - this file is generated automatically from Makefile.am
#
-## Let's rule ;-)
-
-# The asm suffix is for intel syntax assembly, and
-# the s suffix for at & t syntax.
-# S suffix files will be preprocessed by cpp, nasm
-# on the other hand can do this directly.
-
-SUFFIXES = .asm .S .F
-.asm.o:
- $(NASM) $(NASMFLAGS) $< -o $@
-.S.s:
- $(CPP) $< > $@
-.F.f:
- $(CPP) $< > $@
-
#################
# We need a second compile command producing executables
# that can be run on the local host to make the innerloops.
# for cross-compilation. We also need it on parallel machines
# where the MPI executables cannot be run outside a batch queue.
+BUILD_COMPILE = $(BUILD_CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+#################
# This might be bad - but I don't know any other way to enable
# us to type make <progname> in subdirs right now
#AM_CPPFLAGS = -DHAVE_CONFIG_H
-BUILD_COMPILE = $(BUILD_CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-
-
-
# but it can probably be done in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+# libmd uses symbols from libgmx, so libmd must come first on the link
+# line; the order matters when linking against the static archives.
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
+
+EXTRA_DIST = README programs.txt gromacs-3.0.spec
# NB: The programs in contrib do not get double/mpi suffixes automatically,
# to make it easier for new developers to understand this file. If you want
--- /dev/null
+#
+# RPM specification file to make gromacs packages, version 3.0
+# Presently, you cannot relocate from /usr/local/gromacs.
+#
+# Usage:
+#
+# 1. Start from a gromacs distribution tarball, made
+# with "make dist". Put it in the RPM
+# source directory (usually /usr/src/redhat/SOURCES).
+# 2. Edit the version and release info below (bump the
+# release every time you release a new rpm, restore it
+# to 1 for each new version.)
+# 3. Edit the files tags IF YOU MOVE OR ADD ANY FILES
+# (also if you change lib versions)
+# 4. This file assumes an i686-pc-linux-gnu configuration -
+# you will have to change that for a different host,
+# since it is part of the directory names gromacs creates.
+# 5. cd to /usr/src/redhat/SPECS and issue
+# rpm -ba gromacs-3.0.spec
+#
+# That's it - you should have both binary and source rpms now.
+#
+
+#
+# Main package - only dynamic libs, and no header files
+#
+Summary: A package for molecular dynamics simulation
+Name: gromacs
+Version: 3.0
+Release: 1
+Copyright: GPL
+Group: Applications/Science
+Source: http://www.gromacs.org/download/gromacs_source/gromacs-3.0.tar.gz
+URL: http://www.gromacs.org
+Packager: Erik Lindahl <lindahl@gromacs.org>
+%description
+GROMACS is a versatile and extremely well optimized package
+to perform molecular dynamics computer simulations and
+subsequent trajectory analysis. It is developed for
+biomolecules like proteins, but the extremely high
+performance means it is used also in several other fields
+like polymer chemistry and solid state physics. This
+version has the dynamic libs and executables; to hack new
+utility programs you also need the headers and static
+libs in gromacs-devel. Linux kernel 2.4 is STRONGLY
+recommended on Pentium III and later processors since
+GROMACS can then use assembly loops with SSE instructions.
+#
+# The header files and static libraries go into gromacs-devel...
+#
+%package devel
+Summary: Header files and static libraries for GROMACS
+Group: Applications/Science
+Requires: gromacs = %{version}-%{release}
+%description devel
+This package contains header files, static libraries,
+and a program example for the GROMACS molecular
+dynamics software. You need it if you want to write your
+own analysis programs.
+
+
+%prep
+%setup
+
+%build
+./configure
+
+%install
+make install
+make links
+
+%post
+#
+# Add our library dir to /etc/ld.so.conf if it is not already there.
+# grep -q only reports via its exit status, so nothing has to be quoted
+# or word-split, and the path is appended with echo (it is a string to
+# write, not a file to read from).
+#
+if ! grep -q -F /usr/local/gromacs/lib/i686-pc-linux-gnu /etc/ld.so.conf 2>/dev/null; then
+ echo /usr/local/gromacs/lib/i686-pc-linux-gnu >> /etc/ld.so.conf
+fi
+
+# run ldconfig to update the runtime linker database with the new libraries
+# (make sure /sbin is in the $PATH)
+PATH="/sbin:$PATH" ldconfig
+
+%postun
+#
+# Remove gromacs lib dir from /etc/ld.so.conf, since nothing else resides there.
+# rpm passes the number of installed instances left after this operation
+# as $1: only clean up on a real erase (0). On an upgrade the new
+# package's %post has already run and still needs the entry.
+# The temporary file lives next to the target so the mv stays on one
+# filesystem and does not depend on the current directory.
+#
+if test "$1" = 0; then
+ grep -v -F /usr/local/gromacs/lib/i686-pc-linux-gnu /etc/ld.so.conf > /etc/ld.so.conf.gmxtmp
+ mv -f /etc/ld.so.conf.gmxtmp /etc/ld.so.conf
+fi
+
+# after uninstall, run ldconfig to remove the libs from the linker database
+PATH="/sbin:$PATH" ldconfig
+
+
+
+
+%files
+# binaries
+/usr/local/gromacs/bin/i686-pc-linux-gnu/average
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_confrms
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_gyrate
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_order
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sorient
+/usr/local/gromacs/bin/i686-pc-linux-gnu/highway
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjconv
+/usr/local/gromacs/bin/i686-pc-linux-gnu/do_dssp
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_covar
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_h2order
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_potential
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_tcaf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/luck
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjorder
+/usr/local/gromacs/bin/i686-pc-linux-gnu/editconf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_density
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_hbond
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rama
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_traj
+/usr/local/gromacs/bin/i686-pc-linux-gnu/make_ndx
+/usr/local/gromacs/bin/i686-pc-linux-gnu/wheel
+/usr/local/gromacs/bin/i686-pc-linux-gnu/eneconv
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dielectric
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_helix
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rdf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_velacc
+/usr/local/gromacs/bin/i686-pc-linux-gnu/mdrun
+/usr/local/gromacs/bin/i686-pc-linux-gnu/x2top
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_anaeig
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dih
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_lie
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rms
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genbox
+/usr/local/gromacs/bin/i686-pc-linux-gnu/mk_angndx
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xmdrun
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_analyze
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dipoles
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_mdmat
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rmsdist
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genconf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/ngmx
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xpm2ps
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_angle
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_disre
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_mindist
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rmsf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genion
+/usr/local/gromacs/bin/i686-pc-linux-gnu/nmrun
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xrama
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_bond
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dist
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_morph
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rotacf
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genpr
+/usr/local/gromacs/bin/i686-pc-linux-gnu/pdb2gmx
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_bundle
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dyndom
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_msd
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_saltbr
+/usr/local/gromacs/bin/i686-pc-linux-gnu/gmxcheck
+/usr/local/gromacs/bin/i686-pc-linux-gnu/protonate
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_chi
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_enemat
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_nmeig
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sas
+/usr/local/gromacs/bin/i686-pc-linux-gnu/gmxdump
+/usr/local/gromacs/bin/i686-pc-linux-gnu/tpbconv
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_cluster
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_energy
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_nmens
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sgangle
+/usr/local/gromacs/bin/i686-pc-linux-gnu/grompp
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjcat
+#links to /usr/local/bin
+/usr/local/bin/average
+/usr/local/bin/g_confrms
+/usr/local/bin/g_gyrate
+/usr/local/bin/g_order
+/usr/local/bin/g_sorient
+/usr/local/bin/highway
+/usr/local/bin/trjconv
+/usr/local/bin/do_dssp
+/usr/local/bin/g_covar
+/usr/local/bin/g_h2order
+/usr/local/bin/g_potential
+/usr/local/bin/g_tcaf
+/usr/local/bin/luck
+/usr/local/bin/trjorder
+/usr/local/bin/editconf
+/usr/local/bin/g_density
+/usr/local/bin/g_hbond
+/usr/local/bin/g_rama
+/usr/local/bin/g_traj
+/usr/local/bin/make_ndx
+/usr/local/bin/wheel
+/usr/local/bin/eneconv
+/usr/local/bin/g_dielectric
+/usr/local/bin/g_helix
+/usr/local/bin/g_rdf
+/usr/local/bin/g_velacc
+/usr/local/bin/mdrun
+/usr/local/bin/x2top
+/usr/local/bin/g_anaeig
+/usr/local/bin/g_dih
+/usr/local/bin/g_lie
+/usr/local/bin/g_rms
+/usr/local/bin/genbox
+/usr/local/bin/mk_angndx
+/usr/local/bin/xmdrun
+/usr/local/bin/g_analyze
+/usr/local/bin/g_dipoles
+/usr/local/bin/g_mdmat
+/usr/local/bin/g_rmsdist
+/usr/local/bin/genconf
+/usr/local/bin/ngmx
+/usr/local/bin/xpm2ps
+/usr/local/bin/g_angle
+/usr/local/bin/g_disre
+/usr/local/bin/g_mindist
+/usr/local/bin/g_rmsf
+/usr/local/bin/genion
+/usr/local/bin/nmrun
+/usr/local/bin/xrama
+/usr/local/bin/g_bond
+/usr/local/bin/g_dist
+/usr/local/bin/g_morph
+/usr/local/bin/g_rotacf
+/usr/local/bin/genpr
+/usr/local/bin/pdb2gmx
+/usr/local/bin/g_bundle
+/usr/local/bin/g_dyndom
+/usr/local/bin/g_msd
+/usr/local/bin/g_saltbr
+/usr/local/bin/gmxcheck
+/usr/local/bin/protonate
+/usr/local/bin/g_chi
+/usr/local/bin/g_enemat
+/usr/local/bin/g_nmeig
+/usr/local/bin/g_sas
+/usr/local/bin/gmxdump
+/usr/local/bin/tpbconv
+/usr/local/bin/g_cluster
+/usr/local/bin/g_energy
+/usr/local/bin/g_nmens
+/usr/local/bin/g_sgangle
+/usr/local/bin/grompp
+/usr/local/bin/trjcat
+# the topology library
+/usr/local/gromacs/top/
+/usr/local/gromacs/top/FF.dat
+/usr/local/gromacs/top/ffgmx.itp
+/usr/local/gromacs/top/ffgmxnb.itp
+/usr/local/gromacs/top/ffgmxbon.itp
+/usr/local/gromacs/top/ffgmx.atp
+/usr/local/gromacs/top/ffgmx.hdb
+/usr/local/gromacs/top/ffgmx.n2t
+/usr/local/gromacs/top/ffgmx.rtp
+/usr/local/gromacs/top/ffgmx-c.tdb
+/usr/local/gromacs/top/ffgmx-n.tdb
+/usr/local/gromacs/top/ffgmx2.itp
+/usr/local/gromacs/top/ffgmx2nb.itp
+/usr/local/gromacs/top/ffgmx2bon.itp
+/usr/local/gromacs/top/ffgmx2.atp
+/usr/local/gromacs/top/ffgmx2.hdb
+/usr/local/gromacs/top/ffgmx2.rtp
+/usr/local/gromacs/top/ffgmx2-c.tdb
+/usr/local/gromacs/top/ffgmx2-n.tdb
+/usr/local/gromacs/top/ffG43a1.itp
+/usr/local/gromacs/top/ffG43a1nb.itp
+/usr/local/gromacs/top/ffG43a1bon.itp
+/usr/local/gromacs/top/ffG43a1.atp
+/usr/local/gromacs/top/ffG43a1.hdb
+/usr/local/gromacs/top/ffG43a1.rtp
+/usr/local/gromacs/top/ffG43a1-c.tdb
+/usr/local/gromacs/top/ffG43a1-n.tdb
+/usr/local/gromacs/top/ffG43a2.itp
+/usr/local/gromacs/top/ffG43a2nb.itp
+/usr/local/gromacs/top/ffG43a2bon.itp
+/usr/local/gromacs/top/ffG43a2.atp
+/usr/local/gromacs/top/ffG43a2.hdb
+/usr/local/gromacs/top/ffG43a2.rtp
+/usr/local/gromacs/top/ffG43a2-c.tdb
+/usr/local/gromacs/top/ffG43a2-n.tdb
+/usr/local/gromacs/top/ffG43b1.itp
+/usr/local/gromacs/top/ffG43b1nb.itp
+/usr/local/gromacs/top/ffG43b1bon.itp
+/usr/local/gromacs/top/ffG43b1.atp
+/usr/local/gromacs/top/ffG43b1.hdb
+/usr/local/gromacs/top/ffG43b1.rtp
+/usr/local/gromacs/top/ffG43b1-c.tdb
+/usr/local/gromacs/top/ffG43b1-n.tdb
+/usr/local/gromacs/top/1mlg.itp
+/usr/local/gromacs/top/2mlg.itp
+/usr/local/gromacs/top/benzamide.itp
+/usr/local/gromacs/top/bondadd.itp
+/usr/local/gromacs/top/buck.itp
+/usr/local/gromacs/top/decane.itp
+/usr/local/gromacs/top/dlg.itp
+/usr/local/gromacs/top/dmso.itp
+/usr/local/gromacs/top/fa.itp
+/usr/local/gromacs/top/ff_dum.itp
+/usr/local/gromacs/top/flexspc.itp
+/usr/local/gromacs/top/flexspce.itp
+/usr/local/gromacs/top/flexwat-ferguson.itp
+/usr/local/gromacs/top/h2p4o13.itp
+/usr/local/gromacs/top/h2p8o25.itp
+/usr/local/gromacs/top/h2po4.itp
+/usr/local/gromacs/top/ions.itp
+/usr/local/gromacs/top/methanol.itp
+/usr/local/gromacs/top/spc.itp
+/usr/local/gromacs/top/spce.itp
+/usr/local/gromacs/top/tfe.itp
+/usr/local/gromacs/top/tip3pgmx.itp
+/usr/local/gromacs/top/tip4pgmx.itp
+/usr/local/gromacs/top/urea.itp
+/usr/local/gromacs/top/dec50.gro
+/usr/local/gromacs/top/dmso.gro
+/usr/local/gromacs/top/spc216.gro
+/usr/local/gromacs/top/tip4p.gro
+/usr/local/gromacs/top/urea+h2o.gro
+/usr/local/gromacs/top/aminoacids.dat
+/usr/local/gromacs/top/atommass.dat
+/usr/local/gromacs/top/bromacs.dat
+/usr/local/gromacs/top/ca-shift.dat
+/usr/local/gromacs/top/cb-shift.dat
+/usr/local/gromacs/top/co-shift.dat
+/usr/local/gromacs/top/edissoc.dat
+/usr/local/gromacs/top/gurgle.dat
+/usr/local/gromacs/top/ha-shift.dat
+/usr/local/gromacs/top/links.dat
+/usr/local/gromacs/top/phbres.dat
+/usr/local/gromacs/top/random.dat
+/usr/local/gromacs/top/refi_aa.dat
+/usr/local/gromacs/top/specbond.dat
+/usr/local/gromacs/top/surface.dat
+/usr/local/gromacs/top/vdwradii.dat
+/usr/local/gromacs/top/xlateat.dat
+/usr/local/gromacs/top/export.dlg
+/usr/local/gromacs/top/bonds.dlg
+/usr/local/gromacs/top/ss.map
+/usr/local/gromacs/top/ps.m2p
+/usr/local/gromacs/top/table6-10.xvg
+/usr/local/gromacs/top/table6-11.xvg
+/usr/local/gromacs/top/table6-12.xvg
+/usr/local/gromacs/top/table6-8.xvg
+/usr/local/gromacs/top/table6-9.xvg
+# examples
+/usr/local/gromacs/share/tutor/cleanit
+/usr/local/gromacs/share/tutor/gmxdemo/cpeptide.pdb
+/usr/local/gromacs/share/tutor/gmxdemo/demo
+/usr/local/gromacs/share/tutor/nmr1/conf.gro
+/usr/local/gromacs/share/tutor/nmr1/grompp.mdp
+/usr/local/gromacs/share/tutor/nmr1/pep.pdb
+/usr/local/gromacs/share/tutor/nmr1/topol.top
+/usr/local/gromacs/share/tutor/nmr2/conf.gro
+/usr/local/gromacs/share/tutor/nmr2/grompp.mdp
+/usr/local/gromacs/share/tutor/nmr2/pep.pdb
+/usr/local/gromacs/share/tutor/nmr2/topol.top
+/usr/local/gromacs/share/tutor/nmr2/genconf.gcp
+/usr/local/gromacs/share/tutor/water/water.top
+/usr/local/gromacs/share/tutor/water/water.mdp
+/usr/local/gromacs/share/tutor/water/spc216.gro
+/usr/local/gromacs/share/tutor/water/spc216.pdb
+/usr/local/gromacs/share/tutor/water/oxygen.ndx
+/usr/local/gromacs/share/tutor/speptide/speptide.pdb
+/usr/local/gromacs/share/tutor/speptide/pr.mdp
+/usr/local/gromacs/share/tutor/speptide/em.mdp
+/usr/local/gromacs/share/tutor/speptide/full.mdp
+# manual pages
+/usr/local/gromacs/man/
+/usr/local/gromacs/man/man1/
+/usr/local/gromacs/man/man1/g_dih.1
+/usr/local/gromacs/man/man1/g_msd.1
+/usr/local/gromacs/man/man1/g_tcaf.1
+/usr/local/gromacs/man/man1/nmrun.1
+/usr/local/gromacs/man/man1/do_dssp.1
+/usr/local/gromacs/man/man1/g_dipoles.1
+/usr/local/gromacs/man/man1/g_nmeig.1
+/usr/local/gromacs/man/man1/g_traj.1
+/usr/local/gromacs/man/man1/pdb2gmx.1
+/usr/local/gromacs/man/man1/editconf.1
+/usr/local/gromacs/man/man1/g_disre.1
+/usr/local/gromacs/man/man1/g_nmens.1
+/usr/local/gromacs/man/man1/g_velacc.1
+/usr/local/gromacs/man/man1/protonate.1
+/usr/local/gromacs/man/man1/eneconv.1
+/usr/local/gromacs/man/man1/g_dist.1
+/usr/local/gromacs/man/man1/g_order.1
+/usr/local/gromacs/man/man1/genbox.1
+/usr/local/gromacs/man/man1/tpbconv.1
+/usr/local/gromacs/man/man1/g_anaeig.1
+/usr/local/gromacs/man/man1/g_dyndom.1
+/usr/local/gromacs/man/man1/g_potential.1
+/usr/local/gromacs/man/man1/genconf.1
+/usr/local/gromacs/man/man1/trjcat.1
+/usr/local/gromacs/man/man1/g_analyze.1
+/usr/local/gromacs/man/man1/g_enemat.1
+/usr/local/gromacs/man/man1/g_rama.1
+/usr/local/gromacs/man/man1/genion.1
+/usr/local/gromacs/man/man1/trjconv.1
+/usr/local/gromacs/man/man1/g_angle.1
+/usr/local/gromacs/man/man1/g_energy.1
+/usr/local/gromacs/man/man1/g_rdf.1
+/usr/local/gromacs/man/man1/genpr.1
+/usr/local/gromacs/man/man1/trjorder.1
+/usr/local/gromacs/man/man1/g_bond.1
+/usr/local/gromacs/man/man1/g_gyrate.1
+/usr/local/gromacs/man/man1/g_rms.1
+/usr/local/gromacs/man/man1/gmxcheck.1
+/usr/local/gromacs/man/man1/wheel.1
+/usr/local/gromacs/man/man1/g_bundle.1
+/usr/local/gromacs/man/man1/g_h2order.1
+/usr/local/gromacs/man/man1/g_rmsdist.1
+/usr/local/gromacs/man/man1/gmxdump.1
+/usr/local/gromacs/man/man1/x2top.1
+/usr/local/gromacs/man/man1/g_chi.1
+/usr/local/gromacs/man/man1/g_hbond.1
+/usr/local/gromacs/man/man1/g_rmsf.1
+/usr/local/gromacs/man/man1/grompp.1
+/usr/local/gromacs/man/man1/xpm2ps.1
+/usr/local/gromacs/man/man1/g_cluster.1
+/usr/local/gromacs/man/man1/g_helix.1
+/usr/local/gromacs/man/man1/g_rotacf.1
+/usr/local/gromacs/man/man1/highway.1
+/usr/local/gromacs/man/man1/xrama.1
+/usr/local/gromacs/man/man1/g_confrms.1
+/usr/local/gromacs/man/man1/g_lie.1
+/usr/local/gromacs/man/man1/g_saltbr.1
+/usr/local/gromacs/man/man1/make_ndx.1
+/usr/local/gromacs/man/man1/g_covar.1
+/usr/local/gromacs/man/man1/g_mdmat.1
+/usr/local/gromacs/man/man1/g_sas.1
+/usr/local/gromacs/man/man1/mdrun.1
+/usr/local/gromacs/man/man1/g_density.1
+/usr/local/gromacs/man/man1/g_mindist.1
+/usr/local/gromacs/man/man1/g_sgangle.1
+/usr/local/gromacs/man/man1/mk_angndx.1
+/usr/local/gromacs/man/man1/g_morph.1
+/usr/local/gromacs/man/man1/g_sorient.1
+/usr/local/gromacs/man/man1/ngmx.1
+/usr/local/gromacs/man/man1/g_dielectric.1
+# html pages
+/usr/local/gromacs/html/
+/usr/local/gromacs/html/gmxfaq.html
+/usr/local/gromacs/html/online.html
+/usr/local/gromacs/html/gif/
+/usr/local/gromacs/html/gif/annealdn.gif
+/usr/local/gromacs/html/gif/features.gif
+/usr/local/gromacs/html/gif/flow_leftrightup.gif
+/usr/local/gromacs/html/gif/flow_vrule.gif
+/usr/local/gromacs/html/gif/annealup.gif
+/usr/local/gromacs/html/gif/flow_down.gif
+/usr/local/gromacs/html/gif/flow_leftup.gif
+/usr/local/gromacs/html/gif/links.gif
+/usr/local/gromacs/html/gif/articles.gif
+/usr/local/gromacs/html/gif/flow_downleft.gif
+/usr/local/gromacs/html/gif/flow_right+left.gif
+/usr/local/gromacs/html/gif/mail.gif
+/usr/local/gromacs/html/gif/bench.gif
+/usr/local/gromacs/html/gif/flow_hline.gif
+/usr/local/gromacs/html/gif/flow_right.gif
+/usr/local/gromacs/html/gif/manual.gif
+/usr/local/gromacs/html/gif/charts_down.gif
+/usr/local/gromacs/html/gif/flow_left.gif
+/usr/local/gromacs/html/gif/flow_rightleftdown.gif
+/usr/local/gromacs/html/gif/rainbow.gif
+/usr/local/gromacs/html/gif/charts_up.gif
+/usr/local/gromacs/html/gif/flow_leftright.gif
+/usr/local/gromacs/html/gif/flow_uprightleft.gif
+/usr/local/gromacs/html/gif/software.gif
+/usr/local/gromacs/html/gif/faq.gif
+/usr/local/gromacs/html/gif/flow_leftrightdown.gif
+/usr/local/gromacs/html/gif/flow_vline.gif
+/usr/local/gromacs/html/gif/topologies.gif
+/usr/local/gromacs/html/gif/plotje.gif
+/usr/local/gromacs/html/gif/xvgr.gif
+/usr/local/gromacs/html/online/
+/usr/local/gromacs/html/online/edo.html
+/usr/local/gromacs/html/online/g96.html
+/usr/local/gromacs/html/online/log.html
+/usr/local/gromacs/html/online/options.html
+/usr/local/gromacs/html/online/tpa.html
+/usr/local/gromacs/html/online/xvg.html
+/usr/local/gromacs/html/online/edr.html
+/usr/local/gromacs/html/online/m2p.html
+/usr/local/gromacs/html/online/getting_started.html
+/usr/local/gromacs/html/online/out.html
+/usr/local/gromacs/html/online/tpb.html
+/usr/local/gromacs/html/online/ene.html
+/usr/local/gromacs/html/online/gro.html
+/usr/local/gromacs/html/online/map.html
+/usr/local/gromacs/html/online/tpr.html
+/usr/local/gromacs/html/online/eps.html
+/usr/local/gromacs/html/online/hat.html
+/usr/local/gromacs/html/online/mdp.html
+/usr/local/gromacs/html/online/xtc.html
+/usr/local/gromacs/html/online/top.html
+/usr/local/gromacs/html/online/pdb.html
+/usr/local/gromacs/html/online/trj.html
+/usr/local/gromacs/html/online/dat.html
+/usr/local/gromacs/html/online/files.html
+/usr/local/gromacs/html/online/mdp_opt.html
+/usr/local/gromacs/html/online/rtp.html
+/usr/local/gromacs/html/online/include_bot.html
+/usr/local/gromacs/html/online/trr.html
+/usr/local/gromacs/html/online/dlg.html
+/usr/local/gromacs/html/online/flow.html
+/usr/local/gromacs/html/online/mtx.html
+/usr/local/gromacs/html/online/tex.html
+/usr/local/gromacs/html/online/include_top.html
+/usr/local/gromacs/html/online/xpm.html
+/usr/local/gromacs/html/online/edi.html
+/usr/local/gromacs/html/online/g87.html
+/usr/local/gromacs/html/online/itp.html
+/usr/local/gromacs/html/online/ndx.html
+/usr/local/gromacs/html/online/style.css
+/usr/local/gromacs/html/style.css
+# dynamic libraries
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so.1.0.0
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so.1.0.0
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so.1
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so.1
+
+#
+# The header files and static libraries go into gromacs-devel.
+# The name given to %files must match the "%package devel" declaration,
+# otherwise rpm aborts because the sub-package does not exist.
+#
+
+%files devel
+# include headers
+/usr/local/gromacs/include/
+/usr/local/gromacs/include/3dview.h
+/usr/local/gromacs/include/do_md.h
+/usr/local/gromacs/include/invblock.h
+/usr/local/gromacs/include/nrjac.h
+/usr/local/gromacs/include/rwtop.h
+/usr/local/gromacs/include/tpxio.h
+/usr/local/gromacs/include/assert.h
+/usr/local/gromacs/include/do_nm.h
+/usr/local/gromacs/include/javaio.h
+/usr/local/gromacs/include/nrnb.h
+/usr/local/gromacs/include/sheader.h
+/usr/local/gromacs/include/transfer.h
+/usr/local/gromacs/include/atomprop.h
+/usr/local/gromacs/include/dummies.h
+/usr/local/gromacs/include/list.h
+/usr/local/gromacs/include/ns.h
+/usr/local/gromacs/include/shift.h
+/usr/local/gromacs/include/trnio.h
+/usr/local/gromacs/include/axp_asm.h
+/usr/local/gromacs/include/ebin.h
+/usr/local/gromacs/include/macros.h
+/usr/local/gromacs/include/nsb.h
+/usr/local/gromacs/include/shift_util.h
+/usr/local/gromacs/include/txtdump.h
+/usr/local/gromacs/include/binio.h
+/usr/local/gromacs/include/edsam.h
+/usr/local/gromacs/include/magic.h
+/usr/local/gromacs/include/nsgrid.h
+/usr/local/gromacs/include/sim_util.h
+/usr/local/gromacs/include/typedefs.h
+/usr/local/gromacs/include/block_tx.h
+/usr/local/gromacs/include/enxio.h
+/usr/local/gromacs/include/main.h
+/usr/local/gromacs/include/pbc.h
+/usr/local/gromacs/include/smalloc.h
+/usr/local/gromacs/include/update.h
+/usr/local/gromacs/include/bondf.h
+/usr/local/gromacs/include/ewald.h
+/usr/local/gromacs/include/maths.h
+/usr/local/gromacs/include/pdbio.h
+/usr/local/gromacs/include/sortwater.h
+/usr/local/gromacs/include/utils.h
+/usr/local/gromacs/include/buffer.h
+/usr/local/gromacs/include/ewald_util.h
+/usr/local/gromacs/include/matio.h
+/usr/local/gromacs/include/pdebug.h
+/usr/local/gromacs/include/split.h
+/usr/local/gromacs/include/vcm.h
+/usr/local/gromacs/include/calcgrid.h
+/usr/local/gromacs/include/fatal.h
+/usr/local/gromacs/include/mdatoms.h
+/usr/local/gromacs/include/physics.h
+/usr/local/gromacs/include/vec.h
+/usr/local/gromacs/include/calch.h
+/usr/local/gromacs/include/ffscanf.h
+/usr/local/gromacs/include/mdebin.h
+/usr/local/gromacs/include/pme.h
+/usr/local/gromacs/include/statusio.h
+/usr/local/gromacs/include/viewit.h
+/usr/local/gromacs/include/calcmu.h
+/usr/local/gromacs/include/fftgrid.h
+/usr/local/gromacs/include/mdrun.h
+/usr/local/gromacs/include/pppm.h
+/usr/local/gromacs/include/statutil.h
+/usr/local/gromacs/include/vveclib.h
+/usr/local/gromacs/include/callf77.h
+/usr/local/gromacs/include/fftw_wrapper.h
+/usr/local/gromacs/include/memdump.h
+/usr/local/gromacs/include/princ.h
+/usr/local/gromacs/include/steep.h
+/usr/local/gromacs/include/wgms.h
+/usr/local/gromacs/include/filenm.h
+/usr/local/gromacs/include/memtab.h
+/usr/local/gromacs/include/pull.h
+/usr/local/gromacs/include/strdb.h
+/usr/local/gromacs/include/wman.h
+/usr/local/gromacs/include/comlib.h
+/usr/local/gromacs/include/force.h
+/usr/local/gromacs/include/memtest.h
+/usr/local/gromacs/include/string2.h
+/usr/local/gromacs/include/writeps.h
+/usr/local/gromacs/include/complex.h
+/usr/local/gromacs/include/futil.h
+/usr/local/gromacs/include/metacode.h
+/usr/local/gromacs/include/random.h
+/usr/local/gromacs/include/struc2.h
+/usr/local/gromacs/include/x86_3dnow.h
+/usr/local/gromacs/include/comtest.h
+/usr/local/gromacs/include/gbutil.h
+/usr/local/gromacs/include/mpiio.h
+/usr/local/gromacs/include/rbin.h
+/usr/local/gromacs/include/superb.h
+/usr/local/gromacs/include/x86_cpu.h
+/usr/local/gromacs/include/tgroup.h
+/usr/local/gromacs/include/general.h
+/usr/local/gromacs/include/mshift.h
+/usr/local/gromacs/include/rdgroup.h
+/usr/local/gromacs/include/symtab.h
+/usr/local/gromacs/include/x86_sse.h
+/usr/local/gromacs/include/confio.h
+/usr/local/gromacs/include/gmxfio.h
+/usr/local/gromacs/include/mvdata.h
+/usr/local/gromacs/include/rdklib.h
+/usr/local/gromacs/include/sync.h
+/usr/local/gromacs/include/xdrf.h
+/usr/local/gromacs/include/constr.h
+/usr/local/gromacs/include/grompp.h
+/usr/local/gromacs/include/names.h
+/usr/local/gromacs/include/readcomp.h
+/usr/local/gromacs/include/synclib.h
+/usr/local/gromacs/include/xtcio.h
+/usr/local/gromacs/include/copyrite.h
+/usr/local/gromacs/include/gstat.h
+/usr/local/gromacs/include/network.h
+/usr/local/gromacs/include/readinp.h
+/usr/local/gromacs/include/sysstuff.h
+/usr/local/gromacs/include/xvgr.h
+/usr/local/gromacs/include/delay.h
+/usr/local/gromacs/include/index.h
+/usr/local/gromacs/include/nhash.h
+/usr/local/gromacs/include/renum.h
+/usr/local/gromacs/include/systest.h
+/usr/local/gromacs/include/disre.h
+/usr/local/gromacs/include/init.h
+/usr/local/gromacs/include/nr.h
+/usr/local/gromacs/include/reorder.h
+/usr/local/gromacs/include/tags.h
+/usr/local/gromacs/include/do_fit.h
+/usr/local/gromacs/include/nrama.h
+/usr/local/gromacs/include/rmpbc.h
+/usr/local/gromacs/include/types/
+/usr/local/gromacs/include/types/atoms.h
+/usr/local/gromacs/include/types/edsams.h
+/usr/local/gromacs/include/types/forcerec.h
+/usr/local/gromacs/include/types/ifunc.h
+/usr/local/gromacs/include/types/mdatom.h
+/usr/local/gromacs/include/types/nsborder.h
+/usr/local/gromacs/include/types/simple.h
+/usr/local/gromacs/include/types/block.h
+/usr/local/gromacs/include/types/energy.h
+/usr/local/gromacs/include/types/graph.h
+/usr/local/gromacs/include/types/inputrec.h
+/usr/local/gromacs/include/types/nblist.h
+/usr/local/gromacs/include/types/nsgrid.h
+/usr/local/gromacs/include/types/symtab.h
+/usr/local/gromacs/include/types/commrec.h
+/usr/local/gromacs/include/types/enums.h
+/usr/local/gromacs/include/types/group.h
+/usr/local/gromacs/include/types/ishift.h
+/usr/local/gromacs/include/types/nbslist.h
+/usr/local/gromacs/include/types/parm.h
+/usr/local/gromacs/include/types/topology.h
+/usr/local/gromacs/include/types/drblock.h
+/usr/local/gromacs/include/types/filenm.h
+/usr/local/gromacs/include/types/idef.h
+/usr/local/gromacs/include/types/matrix.h
+/usr/local/gromacs/include/types/nrnb.h
+/usr/local/gromacs/include/types/pulls.h
+/usr/local/gromacs/include/types/trx.h
+/usr/local/gromacs/share/template/template.c
+/usr/local/gromacs/share/template/README
+/usr/local/gromacs/share/template/Makefile
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.a
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.a
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.la
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.la
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so
+
grompp|makes a run input file
tpbconv|makes a run input file for restarting a crashed run
mdrun|performs a simulation
+xmdrun|performs simulations with extra experimental features
END
HEAD|Viewing trajectories
EXTRA_DIST = grompplog2top make_gromos_nb.pl make_gromos_rtp.py \
mkhtml mkonline make_gromos_bon.pl \
- mkcompl mknroff make_gromos_rtp.pl
+ mkcompl mknroff make_gromos_rtp.pl \
+ mktex GMXRC NOGMX
cat > $HTMLIDX << EOD
<HTML>
+<HEAD>
<TITLE>GROMACS $VER Online Reference </TITLE>
+</HEAD>
<LINK rel=stylesheet href="online/style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>GROMACS $VER Online Reference</H2>
-<HR>
-<P>
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+
+<table WIDTH="800" NOSAVE NOBORDER >
+<tr NOSAVE>
+<td WIDTH="120" HEIGHT="140" NOSAVE><a href="http://www.gromacs.org/"><img SRC="gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td>
+
+<td ALIGN=LEFT VALIGN=TOP WIDTH=480 NOSAVE>
+<br><br>
+<h2>
+GROMACS $VER<br>
+Online Reference</h2>
+</td>
+<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH=200 NOSAVE>
+<B>VERSION $VER<br>
+`date "+%a %d %b %Y"`</B></td>
+</tr>
+</table>
+
+<hr>
+
<TABLE BORDER=0 CELLSPACING=0 CELLPADDING=10>
<TR>
<TD VALIGN=top WIDTH="25%">
cat >> $HTMLIDX <<EOD
<A HREF="gmxfaq.html">FAQ</a>
<br>
-<br><A HREF="http://www.gromacs.org">GROMACS homepage</A>
</TD>
<TD VALIGN=top WIDTH=75%>
<h3>Programs</h3>
## Process this file with automake to produce Makefile.in
#
-# Don't edit - this file is generated automatically from Makefile.am
-#
+# Note that Makefile is generated automatically from Makefile.in,
+# which is automatically generated from Makefile.am
include $(srcdir)/../Makefile.inc
#
# This is necessary for VPATH builds (and thus distcheck) to work,
-# but it can probably included in a nicer way...
+# but it can probably be included in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
# produce the gmx library
#
-lib_LIBRARIES = libgmxXXX_SUFFIX_XXX.a
+lib_LTLIBRARIES = libgmxXXX_SUFFIX_XXX.la
-libgmxXXX_SUFFIX_XXX_a_SOURCES = \
+libgmxXXX_SUFFIX_XXX_la_SOURCES = \
3dview.c atomprop.c block_tx.c bondfree.c \
buffer.c calcgrid.c calch.c inner.h \
confio.c copyrite.c disre.c do_fit.c \
xdrd.c xtcio.c xvgr.c replace.h \
x86_cpu.c
-EXTRA_libgmxXXX_SUFFIX_XXX_a_SOURCES = \
- mpiio.c libnet.c x86_3dnow.asm f77_wrappers.c \
+EXTRA_libgmxXXX_SUFFIX_XXX_la_SOURCES = \
+ mpiio.c libnet.c x86_3dnow.S f77_wrappers.c \
libxdrf.c ftocstr.c dumxdrf.c mgmx.c \
- widget.c widget.h x86_cpuid.asm axp_asm.S \
- x86_sse.asm innerc.c innerf.f
+ widget.c widget.h x86_cpuid.S axp_asm.s \
+ x86_sse.S
if USE_FORTRAN
BUILT_SOURCES = innerf.f
BUILT_SOURCES = innerc.c
endif
-mpi_obj = mpiio.o
-libnet_obj = libnet.o
-inner_f77_obj = innerf.o f77_wrappers.o
-inner_c_obj = innerc.o
-xdr_obj = libxdrf.o ftocstr.o
-noxdr_obj = dumxdrf.o
-motif_obj = mgmx.o widget.o
-x86_asm_obj = x86_cpuid.o
-sse_obj = x86_sse.o
-tdn_obj = x86_3dnow.o
-axp_asm_obj = axp_asm.o
-
-libgmxXXX_SUFFIX_XXX_a_LIBADD = \
- @PAR_OBJ@ @INNER_F77_OBJ@ @INNER_C_OBJ@ \
- @AXP_ASM_OBJ@ @X86_ASM_OBJ@ @SSE_OBJ@ \
- @TDN_OBJ@ @MOTIF_OBJ@ @XDR_OBJ@
-
-libgmxXXX_SUFFIX_XXX_a_DEPENDENCIES = \
- @PAR_OBJ@ @INNER_F77_OBJ@ @INNER_C_OBJ@ \
- @AXP_ASM_OBJ@ @X86_ASM_OBJ@ @SSE_OBJ@ \
- @TDN_OBJ@ @MOTIF_OBJ@ @XDR_OBJ@
+#
+# NB: The contents of GMXLIB_COND_OBJ is defined in
+# the main configure.in script file
+#
+
+libgmxXXX_SUFFIX_XXX_la_LIBADD = @GMXLIB_COND_OBJ@
+
+libgmxXXX_SUFFIX_XXX_la_DEPENDENCIES = @GMXLIB_COND_OBJ@
# The inner loops
innerf.f: mkinl
innerc.c: mkinl
./mkinl c
-noinst_PROGRAMS = mkinl
+# The compaq compiler is really stupid and thinks a .S file is some
+# kind of object. Instead it preprocesses the .s files.
+# To make this work with both gcc and the compaq compilers, we
+# do a workaround to make gcc preprocess the .s file instead:
+
+if GNU_CC
+# Compile through libtool so that the axp_asm.lo object the library link
+# expects is actually produced (plain $(COMPILE) only leaves axp_asm.o).
+# -x assembler-with-cpp makes gcc run the preprocessor on the .s file.
+axp_asm.lo: axp_asm.s
+	$(LTCOMPILE) -x assembler-with-cpp -c $(srcdir)/axp_asm.s
+endif
+
+# Mkinl is special - we can't use a noinst_PROGRAMS target, since it
+# might have to be compiled with a special non-MPI compiler whose files
+# can be executed on the build machine (i.e., not for the target host):
-mkinl_SOURCES = \
+EXTRA_DIST = \
mkinl.c mkinl_declarations.c mkinl_outerloop.c \
mkinl_innerloop.c mkinl_calcdist.c mkinl_invsqrt.c \
mkinl_recip.c mkinl_interactions.c metacode.c \
$(BUILD_COMPILE) -o $@ $(MKINL_OBJ)
# clean things explicitly, since the target names might have changed
-CLEANFILES = ${lib_LIBRARIES} *_d.a *_mpi.a *~ \\\#* innerc.c innerf.f
+CLEANFILES = ${lib_LTLIBRARIES} *_d.la *_mpi.la *~ \\\#* innerc.c innerf.f mkinl
* Bcopy/Memcpy patch.
*
$Log$
+Revision 1.7 2001/06/20 10:34:01 lindahl
+
+Converted assembly to use gcc instead of nasm, updated html man
+pages.
+The x86 assembly loops is now a single option to configure,
+and the single/double prec. is controlled with --enable-float
+(default is yes), to be consistent with fftw.
+Removed the less common options from the summary printed by
+configure, but they are still available.
+Introduced libtool to create both static and dynamic libraries -
+you can control it with configure options. --disable-shared might
+be suitable for development work.
+To avoid compiling both PIC and non-PIC code you can try --with-pic,
+but the default is both.
+
Revision 1.6 2001/05/14 17:58:06 lindahl
Tagged files with gromacs 3.0 header
*
*/
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
#if defined(SYSVBFUNC)
#include <memory.h>
fprintf(out,"\n");
for(i=0; (i<NCR); i++)
- sp_print(out,CopyrightText[i]);
+ fprintf(out," %s\n",CopyrightText[i]);
+
sprintf(buf,"%s",szProgram);
#ifdef DOUBLE
{ "Lindahl2001a",
"E. Lindahl and B. Hess and D. van der Spoel",
"GROMACS 3.0: A package for molecular simulation and trajectory analysis",
- "Submitted",
+ "To appear in J. Mol. Mod.",
0, 2001, 0, 0 }
};
#define NSTR (int)asize(citedb)
#endif
#if (defined USE_SSE || defined USE_3DNOW)
+
if(cpu_capabilities==UNKNOWN_CPU)
cpu_capabilities=check_x86cpu(log);
+
#endif
-
+
if (eNL >= 0) {
i0 = eNL;
i1 = i0+1;
#define _mkinl_h
static char *SRCID_mkinl_h = "$Id$";
+
+#ifdef HAVE_CONFIG_H
#include <config.h>
+#endif
+
#include <types/simple.h>
#include <metacode.h>
#define NSR(s) check_html(s,program)
- fprintf(out,"<TITLE>%s</TITLE>\n",program);
+ fprintf(out,"<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n",program);
fprintf(out,"<LINK rel=stylesheet href=\"style.css\" type=\"text/css\">\n");
- fprintf(out,"<BODY text=\"#000000\" bgcolor=\"#FFFFFF\" link=\"#0000EF\" vlink=\"#650065\" alink=\"#FF0000\">\n");
- fprintf(out,"<H2>%s</H2>\n",program);
- fprintf(out,"<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH=\"98%%\">\n");
- fprintf(out,"<TR>\n<TD><font size=-1><A HREF=\"../online.html\">Main Table of Contents</A></font></TD>\n");
- fprintf(out,"<TD ALIGN=RIGHT><B>%s</B></TR>\n",GromacsVersion());
- fprintf(out,"<TR><TD><font size=-1><A HREF=\"http://www.gromacs.org\">GROMACS homepage</A></font></TD>\n");
- fprintf(out,"<TD ALIGN=RIGHT><B>%s</B></TR></TABLE></CENTER><HR>\n",mydate());
+ fprintf(out,"<BODY text=\"#000000\" bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#990000\" alink=\"#FF0000\">\n");
+ fprintf(out,"<table WIDTH=\"800\" NOBORDER >\n<TR>\n");
+ fprintf(out,"<td WIDTH=\"120\" HEIGHT=\"133\">\n"
+ "<a href=\"http://www.gromacs.org/\">"
+ "<img SRC=\"../gif/gmxlogo_small.jpg\""
+ "BORDER=0 height=133 width=116></a></td>");
+ fprintf(out,"<td ALIGN=LEFT VALIGN=TOP WIDTH=480>"
+ "<br><br><h2>GROMACS Online Reference:<br>%s</h2>",program);
+ fprintf(out,"<font size=-1><A HREF=\"../online.html\">Main Table of Contents</A></font><br>");
+ fprintf(out,"<br></td>\n<TD ALIGN=RIGHT VALIGN=BOTTOM><B>%s<br>\n",GromacsVersion());
+ fprintf(out,"%s</B></td></tr></TABLE>\n<HR>\n",mydate());
if (nldesc > 0) {
- fprintf(out,"<H3>Description</H3>\n");
+ fprintf(out,"<H3>Description</H3>\n<p>\n");
for(i=0; (i<nldesc); i++)
fprintf(out,"%s\n",NSR(desc[i]));
}
--- /dev/null
+/*
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.0
+ *
+ * Copyright (c) 1991-2001
+ * BIOSON Research Institute, Dept. of Biophysical Chemistry
+ * University of Groningen, The Netherlands
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */
+
+/* This file contains a subset of the gromacs innerloops
+ * manually written in assembly to optimize performance
+ * on AMD extended 3DNow-enabled processors like Athlon
+ * and later generations.
+ * Erik Lindahl, 2000-2001, erik@theophys.kth.se
+ *
+ * We use intel syntax for portability. There are probably some GNU-specific
+ * things here, but they are easy to fix.
+ */
+
+.intel_syntax noprefix
+
+.text
+
+mm_two:
+ .long 0x40000000
+ .long 0x40000000
+mm_six:
+ .long 0x40c00000
+ .long 0x40c00000
+mm_twelve:
+ .long 0x41400000
+ .long 0x41400000
+
+ .align 4
+
+.globl check3dnow  /* Probe routine: try to issue an Extended 3DNow instruction.
+ * Reads no arguments and does not set a return register; it only executes
+ * pswapd between two femms instructions and returns.
+ * NOTE(review): presumably the caller detects a CPU without Extended 3DNow
+ * by catching the resulting illegal-instruction fault - confirm in caller.
+ */
+	.type check3dnow,@function
+check3dnow:	
+	femms           /* enter a clean MMX/3DNow state before the probe */
+	pswapd mm0,mm0  /* the probe itself: pswapd is the extended instruction being tested */
+	femms           /* leave the MMX/3DNow state again before returning */
+	ret
+
+
+.globl vecrecip_3dnow  /* vecrecip_3dnow: elementwise reciprocal of 32-bit floats.
+ * Stack arguments, as read below:
+ *   [ebp +  8]  pointer the input values are loaded from  (kept in eax)
+ *   [ebp + 12]  pointer the results are stored to         (kept in ebx)
+ *   [ebp + 16]  element count n                           (kept in ecx/edx)
+ * The main loop handles four elements per pass (n >> 2 passes), the tail
+ * loop handles the remaining n & 3 elements one at a time.  Each result is
+ * a pfrcp seed refined with pfrcpit1/pfrcpit2.
+ * eax, ebx, ecx and edx are saved on entry and restored on exit.
+ */
+	.type vecrecip_3dnow,@function
+vecrecip_3dnow:	
+	push ebp
+	mov ebp,esp	
+	push eax
+	push ebx
+	push ecx
+	push edx
+
+	mov 	eax, [ebp + 8]      /* eax = input pointer */
+	mov 	ebx, [ebp + 12]	    /* ebx = output pointer */
+	mov 	ecx, [ebp + 16]     /* ecx = n */
+        mov 	edx, ecx            /* keep n in edx for the tail count */
+        shr 	ecx, 2              /* ecx = number of 4-element passes */
+        jecxz   .vecrecip_tail      /* fewer than 4 elements: tail only */
+        emms	
+.vecrecip_mainloop:	
+        movq mm0,[eax]              /* mm0 = elements 0,1 */
+	add eax, 8
+        pfrcp mm1,mm0               /* seed for the low element of mm0 */
+	movq mm4,[eax]              /* mm4 = elements 2,3 */
+	pswapd mm0,mm0              /* swap halves to reach the high element */
+	add eax, 8
+        pfrcp mm2,mm0               /* seed for the (former) high element */
+	pswapd mm0,mm0              /* swap back to the original order */
+        pfrcp mm5,mm4               /* same seed/swap sequence for mm4 */
+	pswapd mm4,mm4	
+	punpckldq mm1,mm2           /* mm1 = both seeds for mm0, in element order */
+	pfrcp mm6,mm4
+	pswapd mm4,mm4
+        pfrcpit1 mm0,mm1            /* refinement step 1 for elements 0,1 */
+	punpckldq mm5,mm6	    /* mm5 = both seeds for mm4 */
+        pfrcpit2 mm0,mm1            /* refinement step 2: mm0 = reciprocals */
+        movq [ebx],mm0              /* store results 0,1 */
+        pfrcpit1 mm4,mm5            /* refinement for elements 2,3 */
+	add ebx, 8
+        pfrcpit2 mm4,mm5
+        movq [ebx],mm4              /* store results 2,3 */
+	add ebx, 8	
+        dec ecx
+        jecxz .vecrecip_tail        /* all 4-element passes done */
+        jmp short .vecrecip_mainloop
+.vecrecip_tail:
+        mov ecx,edx
+        and ecx,3                   /* ecx = n & 3 leftover elements */
+        jecxz .vecrecip_end
+.vecrecip_tailloop:	
+        movd mm0,[eax]              /* one element at a time */
+	add eax, 4
+        pfrcp mm1,mm0
+        pfrcpit1 mm0,mm1
+        pfrcpit2 mm0,mm1
+        movd [ebx],mm0	
+	add ebx, 4
+	dec ecx
+	jecxz .vecrecip_end
+	jmp short .vecrecip_tailloop
+.vecrecip_end:	
+	emms                        /* leave the MMX/3DNow state before returning */
+	pop edx
+	pop ecx
+	pop ebx
+	pop eax
+	leave
+	ret
+
+
+.globl vecinvsqrt_3dnow  /* vecinvsqrt_3dnow: elementwise 1/sqrt of 32-bit floats.
+ * Stack arguments, as read below:
+ *   [ebp +  8]  pointer the input values are loaded from  (kept in eax)
+ *   [ebp + 12]  pointer the results are stored to         (kept in ebx)
+ *   [ebp + 16]  element count n                           (kept in ecx/edx)
+ * The main loop handles four elements per pass (n >> 2 passes), the tail
+ * loop handles the remaining n & 3 elements one at a time.  Each result is
+ * a pfrsqrt seed refined with the pfmul / pfrsqit1 / pfrcpit2 sequence.
+ * eax, ebx, ecx and edx are saved on entry and restored on exit.
+ */
+	.type vecinvsqrt_3dnow,@function
+vecinvsqrt_3dnow:	
+	push ebp
+	mov ebp,esp	
+	push eax
+	push ebx
+	push ecx
+	push edx
+
+	mov 	eax, [ebp + 8]      /* eax = input pointer */
+	mov 	ebx, [ebp + 12]	    /* ebx = output pointer */
+	mov 	ecx, [ebp + 16]     /* ecx = n */
+        mov 	edx, ecx            /* keep n in edx for the tail count */
+        shr 	ecx, 2              /* ecx = number of 4-element passes */
+        jecxz   .vecinvsqrt_tail    /* fewer than 4 elements: tail only */
+        emms	
+.vecinvsqrt_mainloop:	
+        movq mm0,[eax]              /* mm0 = elements 0,1 */
+	add eax, 8
+        pfrsqrt mm1,mm0             /* seed for the low element of mm0 */
+	movq mm4,[eax]              /* mm4 = elements 2,3 */
+	pswapd mm0,mm0              /* swap halves to reach the high element */
+	add eax, 8
+        pfrsqrt mm2,mm0             /* seed for the (former) high element */
+	pswapd mm0,mm0              /* swap back to the original order */
+        pfrsqrt mm5,mm4             /* same seed/swap sequence for mm4 */
+	pswapd mm4,mm4	
+	punpckldq mm1,mm2           /* mm1 = both seeds for mm0, in element order */
+	pfrsqrt mm6,mm4
+	movq mm3,mm1                /* keep the seeds: pfrcpit2 needs them below */
+	pswapd mm4,mm4
+        pfmul mm1,mm1               /* seed squared, input to pfrsqit1 */
+	punpckldq mm5,mm6	    /* mm5 = both seeds for mm4 */
+        pfrsqit1 mm1,mm0            /* refinement step 1 for elements 0,1 */
+	movq mm7,mm5	            /* keep the seeds for elements 2,3 */
+        pfrcpit2 mm1,mm3            /* refinement step 2: mm1 = results 0,1 */
+	pfmul mm5,mm5
+        movq [ebx],mm1              /* store results 0,1 */
+        pfrsqit1 mm5,mm4
+	add ebx, 8
+        pfrcpit2 mm5,mm7            /* mm5 = results 2,3 */
+        movq [ebx],mm5              /* store results 2,3 */
+	add ebx, 8	
+        dec ecx
+        jecxz .vecinvsqrt_tail      /* all 4-element passes done */
+        jmp short .vecinvsqrt_mainloop
+.vecinvsqrt_tail:
+        mov ecx,edx
+        and ecx,3                   /* ecx = n & 3 leftover elements */
+        jecxz .vecinvsqrt_end
+.vecinvsqrt_tailloop:	
+        movd mm0,[eax]              /* one element at a time */
+	add eax, 4
+        pfrsqrt mm1,mm0
+	movq mm2,mm1                /* keep the seed for pfrcpit2 */
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0
+        pfrcpit2 mm1,mm2
+        movd [ebx],mm1	
+	add ebx, 4
+	dec ecx
+	jecxz .vecinvsqrt_end
+	jmp short .vecinvsqrt_tailloop
+.vecinvsqrt_end:	
+	emms                        /* leave the MMX/3DNow state before returning */
+	pop edx
+	pop ecx
+	pop ebx
+	pop eax
+	leave
+	ret
+
+
+/* inl0100_3dnow: Lennard-Jones-only inner loop (no Coulomb terms) written
+ * with 3DNow instructions.  All arguments are read from the stack at the
+ * ebp-relative offsets named by the .equ symbols below (nri ... Vnb).
+ *
+ * For each of the nri i-particles the routine
+ *  - adds the shift vector selected through shift[] to the i position,
+ *  - walks the neighbours jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], two at a
+ *    time plus one optional leftover, computing
+ *        vnb   = c12*rinvsq^6 - c6*rinvsq^3
+ *        fscal = (12*c12*rinvsq^6 - 6*c6*rinvsq^3) * rinvsq
+ *    with c6/c12 fetched from nbfp[] through the type[] of both particles,
+ *  - adds fscal*dr to the i force and subtracts it from the j force in
+ *    faction[],
+ *  - finally adds the summed i force to faction[] and fshift[], and the
+ *    summed potential to Vnb[gid].
+ *
+ * The argument slots shift, iinr, jindex, gid and nri are advanced or
+ * decremented in place, i.e. they serve as loop state.  The code assumes
+ * nri >= 1.  eax, ebx, ecx, edx, esi and edi are saved and restored.
+ *
+ * Instructions with a memory destination and an immediate source carry an
+ * explicit "dword ptr": without a register operand the assembler cannot
+ * deduce the operand size in Intel syntax.
+ */
+.globl inl0100_3dnow
+	.type inl0100_3dnow,@function
+inl0100_3dnow:	
+	/* ebp-relative offsets of the stack arguments */
+.equ		nri,		8
+.equ		iinr,		12
+.equ		jindex,		16
+.equ		jjnr,		20
+.equ		shift,		24
+.equ		shiftvec,	28
+.equ		fshift,		32
+.equ		gid,		36
+.equ		pos,		40		
+.equ		faction,	44
+.equ		type,		48
+.equ		ntype,		52
+.equ		nbfp,		56	
+.equ		Vnb,		60
+	/* stack offsets for local variables */
+.equ		is3,         0
+.equ		ii3,         4
+.equ		ix,          8
+.equ		iy,         12
+.equ		iz,         16
+.equ		vnbtot,     20  /* repeated (64bit) to fill 3dnow reg */
+.equ		c6,         28  /* repeated (64bit) to fill 3dnow reg */
+.equ		c12,        36  /* repeated (64bit) to fill 3dnow reg */
+.equ		six,        44  /* repeated (64bit) to fill 3dnow reg */
+.equ		twelve,     52  /* repeated (64bit) to fill 3dnow reg */
+.equ		ntia,       60
+.equ		innerjjnr,  64
+.equ		innerk,     68		
+.equ		fix,        72
+.equ		fiy,        76
+.equ		fiz,	    80
+.equ		dx1,	    84
+.equ		dy1,	    88
+.equ		dz1,	    92
+.equ		dx2,	    96
+.equ		dy2,	    100
+.equ		dz2,	    104
+	push ebp
+	mov ebp,esp
+        push eax
+        push ebx
+        push ecx
+        push edx
+	push esi
+	push edi
+	sub esp, 108		/* local stack space */
+	femms
+	/* move data to local stack */
+	movq  mm0, [mm_six]
+	movq  mm1, [mm_twelve]
+	movq  [esp + six ],    mm0
+	movq  [esp + twelve ], mm1
+	/* assume we have at least one i particle - start directly */
+.i0100_outer:
+	mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+	mov   ebx, [eax]              /* ebx=shift[n] */
+	add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+	lea   ebx, [ebx + ebx*2]      /* ebx=3*is */
+	mov   [esp + is3],ebx    	/* store is3 */
+
+	mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+	
+	movq  mm0, [eax + ebx*4]      /* move shX/shY to mm0 and shZ to mm1. */
+	movd  mm1, [eax + ebx*4 + 8]
+
+	mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+	add   dword ptr [ebp + iinr], 4   /* advance pointer */
+	mov   ebx, [ecx]	        /* ebx =ii */
+
+	/* ntia = 2*ntype*type[ii]: row offset of this i type in nbfp[] */
+	mov   edx, [ebp + type]
+	mov   edx, [edx + ebx*4]
+	imul  edx, [ebp + ntype]
+	shl   edx, 1
+	mov   [esp + ntia], edx
+
+	lea   ebx, [ebx + ebx*2]	/* ebx = 3*ii=ii3 */
+	mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+	pfadd mm0, [eax + ebx*4]      /* ix = shX + posX (and iy too) */
+	movd  mm3, [eax + ebx*4 + 8]  /* cannot use direct memory add for 4 bytes (iz) */
+	mov   [esp + ii3], ebx
+	pfadd mm1, mm3
+	movq  [esp + ix], mm0	
+	movd  [esp + iz], mm1	
+				
+	/* clear total potential and i forces */
+	pxor  mm7,mm7
+	movq  [esp + vnbtot], mm7
+	movq  [esp + fix],    mm7
+	movd  [esp + fiz],    mm7
+
+	mov   eax, [ebp + jindex]
+	mov   ecx, [eax]	         /* jindex[n] */
+	mov   edx, [eax + 4]	         /* jindex[n+1] */
+	add   dword ptr [ebp + jindex], 4
+	sub   edx, ecx               /* number of innerloop atoms */
+
+	mov   esi, [ebp + pos]
+	mov   edi, [ebp + faction]	
+	mov   eax, [ebp + jjnr]
+	shl   ecx, 2
+	add   eax, ecx
+	mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+	sub   edx,  2                /* flags of this sub are tested by jge below */
+	mov   [esp + innerk], edx    /* number of innerloop atoms */
+	jge   .i0100_unroll_loop
+	jmp   .i0100_finish_inner
+.i0100_unroll_loop:
+	/* paired innerloop starts here */
+	mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+	mov   eax, [ecx]	
+	mov   ebx, [ecx + 4]         /* eax/ebx=jnr */
+	add   dword ptr [esp + innerjjnr], 8  /* advance pointer (unrolled 2) */
+	prefetch [ecx + 16]	     /* prefetch data - trial and error says 16 is best */
+
+	mov ecx, [ebp + type]
+	mov edx, [ecx + eax*4]        /* type [jnr1] */
+	mov ecx, [ecx + ebx*4]       /* type [jnr2] */
+
+	mov esi, [ebp + nbfp]		/* base of nbfp */
+	shl edx, 1
+	shl ecx, 1
+	add edx, [esp + ntia]	     /* tja = ntia + 2*type */
+	add ecx, [esp + ntia]
+
+	movq mm5, [esi + edx*4]		/* mm5 = 1st c6 / c12 */
+	movq mm7, [esi + ecx*4]		/* mm7 = 2nd c6 / c12 */
+	movq mm6,mm5			
+	punpckldq mm5,mm7		/* mm5 = 1st c6 / 2nd c6 */
+	punpckhdq mm6,mm7		/* mm6 = 1st c12 / 2nd c12 */
+	movq [esp + c6], mm5
+	movq [esp + c12], mm6
+
+	lea   eax, [eax + eax*2]     /* replace jnr with j3 */
+	lea   ebx, [ebx + ebx*2]		
+
+	mov   esi, [ebp + pos]
+
+	movq  mm0, [esp + ix]
+	movd  mm1, [esp + iz]	 	
+	movq  mm4, [esi + eax*4]     /* fetch first j coordinates */
+	movd  mm5, [esi + eax*4 + 8]		
+	pfsubr mm4,mm0		     /* dr = ir - jr */
+	pfsubr mm5,mm1
+	movq  [esp + dx1], mm4	     /* store dr */
+	movd  [esp + dz1], mm5
+	pfmul mm4,mm4	             /* square dx,dy,dz */
+	pfmul mm5,mm5		
+	pfacc mm4, mm5               /* accumulate to get dx*dx+dy*dy+dz*dz */
+	pfacc mm4, mm5		     /* first rsq in lower mm4 */
+
+	movq  mm6, [esi + ebx*4]     /* fetch second j coordinates */
+	movd  mm7, [esi + ebx*4 + 8]
+	
+	pfsubr mm6,mm0	             /* dr = ir - jr */
+	pfsubr mm7,mm1
+	movq  [esp + dx2], mm6	     /* store dr */
+	movd  [esp + dz2], mm7
+	pfmul mm6,mm6	             /* square dx,dy,dz */
+	pfmul mm7,mm7
+	pfacc mm6, mm7		     /* accumulate to get dx*dx+dy*dy+dz*dz */
+	pfacc mm6, mm7	             /* second rsq in lower mm6 */
+
+        pfrcp mm0, mm4	             /* lookup reciprocal seed */
+        pfrcp mm1, mm6
+ 
+	punpckldq mm0,mm1
+	punpckldq mm4,mm6        	/* now 4 has rsq and 0 the seed for both pairs. */
+                  	        	/* amd 3dnow N-R iteration to get full precision. */
+        pfrcpit1 mm4,mm0				
+        pfrcpit2 mm4,mm0	
+	/* mm4 now contains invsq,
+	 * do potential and fscal
+	 */
+	movq  mm0, mm4               /* mm0 keeps invsq for the fscal product */
+	pfmul mm4, mm0
+	pfmul mm4, mm0             	/* mm4=rinvsix */
+	movq  mm5, mm4	
+	pfmul mm5, mm5	            /* mm5=rinvtwelve */
+
+	pfmul mm5, [esp + c12]
+	pfmul mm4, [esp + c6]	
+	movq mm6, mm5	/* mm6 is vnb12-vnb6 */
+	pfsub mm6, mm4
+
+	pfmul mm4, [esp + six]
+
+	pfmul mm5, [esp + twelve]
+	pfsub mm5,mm4
+	pfmul mm0, mm5        /* mm0 is total fscal now */
+
+	prefetchw [esp + dx1]	/* prefetch the stack slot holding dr (stored right after the i forces) */
+
+	/* spread fscalar to both positions */
+	movq mm1,mm0
+	punpckldq mm0,mm0
+	punpckhdq mm1,mm1
+
+	/* calc vector force */
+	prefetchw [edi + eax*4]	/* prefetch the 1st faction to cache */
+	movq mm2,  [esp + dx1]	/* fetch dr */
+	movd mm3,  [esp + dz1]
+
+	/* update vnbtot */
+	pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+	movq [esp + vnbtot], mm6       /* store the sum */
+
+	prefetchw [edi + ebx*4]	/* prefetch the 2nd faction to cache */
+	pfmul mm2, mm0		/* mult by fs */
+	pfmul mm3, mm0
+
+	movq mm4,  [esp + dx2] 	/* fetch dr */
+	movd mm5,  [esp + dz2]
+	pfmul mm4, mm1   	/* mult by fs */
+	pfmul mm5, mm1
+	/* update i forces */
+
+	movq mm0,  [esp + fix]
+	movd mm1,  [esp + fiz]
+	pfadd mm0, mm2
+	pfadd mm1, mm3
+
+	pfadd mm0, mm4
+	pfadd mm1, mm5
+	movq [esp + fix], mm0
+	movd [esp + fiz], mm1
+	/* update j forces */
+
+	movq mm0,  [edi + eax*4]
+	movd mm1,  [edi + eax*4 + 8]
+	movq mm6,  [edi + ebx*4]
+	movd mm7,  [edi + ebx*4 + 8]
+	
+	pfsub mm0, mm2
+	pfsub mm1, mm3
+	pfsub mm6, mm4
+	pfsub mm7, mm5
+	
+	movq [edi + eax*4], mm0
+	movd [edi + eax*4 +8], mm1
+	movq [edi + ebx*4], mm6
+	movd [edi + ebx*4 + 8], mm7
+	
+	/* should we do one more iteration? */
+	sub   dword ptr [esp + innerk], 2
+	jl    .i0100_finish_inner
+	jmp   .i0100_unroll_loop
+.i0100_finish_inner:	
+	/* innerk is -2 or -1 here; bit 0 set means one j particle is left */
+	and dword ptr [esp + innerk], 1
+	jnz  .i0100_single_inner
+	jmp  .i0100_updateouterdata		
+.i0100_single_inner:
+	/* a single j particle iteration here - compare with the unrolled code for comments */
+	mov   eax, [esp + innerjjnr]
+	mov   eax, [eax]	/* eax=jnr offset */
+
+	mov esi, [ebp + nbfp]
+	mov ecx, [ebp + type]
+	mov edx, [ecx + eax*4]        	 /* type [jnr1] */
+	shl edx, 1
+	add edx, [esp + ntia]	     /* tja = ntia + 2*type */
+	movd mm5, [esi + edx*4]		/* mm5 = 1st c6 */
+	movq [esp + c6], mm5
+	movd mm5, [esi + edx*4 + 4]	/* mm5 = 1st c12 */
+	movq [esp + c12], mm5
+
+	mov   esi, [ebp + pos]
+	lea   eax, [eax + eax*2]
+
+	movq  mm0, [esp + ix]
+	movd  mm1, [esp + iz]
+	movq  mm4, [esi + eax*4]
+	movd  mm5, [esi + eax*4 + 8]
+	pfsubr mm4, mm0
+	pfsubr mm5, mm1
+	movq  [esp + dx1], mm4
+	pfmul mm4,mm4
+	movd  [esp + dz1], mm5	
+	pfmul mm5,mm5
+	pfacc mm4, mm5
+	pfacc mm4, mm5		/* mm4=rsq */
+	
+        pfrcp mm0,mm4
+        pfrcpit1 mm4,mm0				
+        pfrcpit2 mm4,mm0	/* mm4=invsq */
+	/* calculate potentials and scalar force */
+	movq  mm0, mm4
+
+	pfmul mm4, mm0
+	pfmul mm4, mm0             	/* mm4=rinvsix */
+	movq  mm5, mm4	
+	pfmul mm5, mm5	                /* mm5=rinvtwelve */
+
+	pfmul mm5, [esp + c12]
+	pfmul mm4, [esp + c6]	
+	movq mm6, mm5	/* mm6 is vnb12-vnb6 */
+	pfsub mm6, mm4
+
+	pfmul mm4, [esp + six]
+
+	pfmul mm5, [esp + twelve]
+	pfsub mm5, mm4
+	pfmul mm0, mm5        /* mm0 is total fscal now */
+
+	/* update vnbtot */
+	pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+	movq [esp + vnbtot], mm6       /* store the sum */
+
+	/* spread fscalar to both positions */
+	punpckldq mm0,mm0
+	/* calc vectorial force */
+	prefetchw [edi + eax*4]	/* prefetch faction to cache */
+	movq mm2,  [esp + dx1]
+	movd mm3,  [esp + dz1]
+
+	pfmul mm2, mm0
+	pfmul mm3, mm0
+
+	/* update i particle force */
+	movq mm0,  [esp + fix]
+	movd mm1,  [esp + fiz]
+	pfadd mm0, mm2
+	pfadd mm1, mm3
+	movq [esp + fix], mm0
+	movd [esp + fiz], mm1
+	/* update j particle force */
+	movq mm0,  [edi + eax*4]
+	movd mm1,  [edi + eax *4+ 8]
+	pfsub mm0, mm2
+	pfsub mm1, mm3
+	movq [edi + eax*4], mm0
+	movd [edi + eax*4 +8], mm1
+	/* done! */
+.i0100_updateouterdata:	
+	mov   ecx, [esp + ii3]
+
+	movq  mm6, [edi + ecx*4]       /* increment i force */
+	movd  mm7, [edi + ecx*4 + 8]	
+	pfadd mm6, [esp + fix]
+	pfadd mm7, [esp + fiz]        /* 64-bit read; only the low half is stored by movd below */
+	movq  [edi + ecx*4],    mm6
+	movd  [edi + ecx*4 +8], mm7
+
+	mov   ebx, [ebp + fshift]    /* increment fshift force */
+	mov   edx, [esp + is3]
+
+	movq  mm6, [ebx + edx*4]	
+	movd  mm7, [ebx + edx*4 + 8]	
+	pfadd mm6, [esp + fix]
+	pfadd mm7, [esp + fiz]
+	movq  [ebx + edx*4],     mm6
+	movd  [ebx + edx*4 + 8], mm7
+
+	mov   edx, [ebp + gid]      /* get group index for this i particle */
+	mov   edx, [edx]
+	add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+	movq  mm7, [esp + vnbtot]     
+	pfacc mm7,mm7	          /* get and sum the two parts of total potential */
+	
+	mov   eax, [ebp + Vnb]
+	movd  mm6, [eax + edx*4] 
+	pfadd mm6, mm7
+	movd  [eax + edx*4], mm6  /* increment vnb[gid] */
+
+	/* finish if last */
+	mov   ecx, [ebp + nri]
+	dec ecx
+	jecxz  .i0100_end
+	/* not last, iterate once more! */
+	mov [ebp + nri], ecx
+	jmp .i0100_outer
+.i0100_end:
+	femms
+	add esp, 108
+	pop edi
+	pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+	leave
+	ret
+
+
+
+
+
+
+.globl inl0110_3dnow
+ .type inl0110_3dnow,@function
+inl0110_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ nsatoms, 64
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ vnbtot, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 48 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 56 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 64 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 72
+.equ innerjjnr0, 76
+.equ innerk0, 80
+.equ innerjjnr, 84
+.equ innerk, 88
+.equ fix, 92
+.equ fiy, 96
+.equ fiz, 100
+.equ dx1, 104
+.equ dy1, 108
+.equ dz1, 112
+.equ dx2, 116
+.equ dy2, 120
+.equ dz2, 124
+.equ nsvdwc, 128
+.equ nscoul, 132
+.equ nsvdw, 136
+.equ solnr, 140
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 144 /* local stack space */
+ femms
+ movq mm0, [mm_six]
+ movq mm1, [mm_twelve]
+ movq [esp + six], mm0
+ movq [esp + twelve], mm1
+ /* assume we have at least one i particle - start directly */
+.i0110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vnbtot], mm7
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i0110_mno_vdwc
+ jmp .i0110_testvdw
+.i0110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0110_unroll_vdwc_loop
+ jmp .i0110_finish_vdwc_inner
+.i0110_unroll_vdwc_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrcp mm0, mm4 /* lookup reciprocal seed */
+ pfrcp mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ /* amd 3dnow N-R iteration to get full precision */
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0
+ /* mm4 now contains invsq,
+ * do potential and fscal
+ */
+ movq mm0, mm4
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5,mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i0110_finish_vdwc_inner
+ jmp .i0110_unroll_vdwc_loop
+.i0110_finish_vdwc_inner:
+ and [esp + innerk], 1
+ jnz .i0110_single_vdwc_inner
+ jmp .i0110_updateouterdata_vdwc
+.i0110_single_vdwc_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrcp mm0,mm4
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0 /* mm4=invsq */
+ /* calculate potentials and scalar force */
+ movq mm0, mm4
+
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5, mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i0110_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz .i0110_testvdw
+ jmp .i0110_mno_vdwc
+.i0110_testvdw:
+ mov ebx, [esp + nscoul]
+ add [esp + solnr], ebx
+
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i0110_mno_vdw
+ jmp .i0110_last_mno
+.i0110_mno_vdw:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0110_unroll_vdw_loop
+ jmp .i0110_finish_vdw_inner
+.i0110_unroll_vdw_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrcp mm0, mm4 /* lookup reciprocal seed */
+ pfrcp mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ /* amd 3dnow N-R iteration to get full precision */
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0
+ /* mm4 now contains invsq,
+ * do potential and fscal
+ */
+ movq mm0, mm4
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5,mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i0110_finish_vdw_inner
+ jmp .i0110_unroll_vdw_loop
+.i0110_finish_vdw_inner:
+ and [esp + innerk], 1
+ jnz .i0110_single_vdw_inner
+ jmp .i0110_updateouterdata_vdw
+.i0110_single_vdw_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrcp mm0,mm4
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0 /* mm4=invsq */
+ /* calculate potentials and scalar force */
+ movq mm0, mm4
+
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5, mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i0110_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i0110_last_mno
+ jmp .i0110_mno_vdw
+
+.i0110_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0110_outer
+.i0110_end:
+ femms
+ add esp, 144
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+/* inl0300_3dnow: tabulated dispersion/repulsion inner loop (3DNow!). */
+/* For each of the nri i particles it walks the j particles listed in */
+/* jjnr[jindex[n] .. jindex[n+1]), two at a time plus an optional single */
+/* remainder, looks up the dispersion and repulsion tables in VFtab, */
+/* scales them by c6/c12 from nbfp, and accumulates the forces into */
+/* faction[] and fshift[] and the potential into Vnb[gid]. */
+/* Arguments are read from the stack relative to ebp (offsets below). */
+/* The shift, iinr, jindex and gid pointer arguments are advanced in */
+/* their stack slots, and nri is counted down in its stack slot. */
+.globl inl0300_3dnow
+ .type inl0300_3dnow,@function
+inl0300_3dnow:
+ /* argument offsets relative to ebp (ebp+4 is the return address) */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ tabscale, 64
+.equ VFtab, 68
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8
+.equ iy, 12
+.equ iz, 16
+.equ vnbtot, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 68
+.equ innerjjnr, 72
+.equ innerk, 76
+.equ fix, 80
+.equ fiy, 84
+.equ fiz, 88
+.equ dx1, 92
+.equ dy1, 96
+.equ dz1, 100
+.equ dx2, 104
+.equ dy2, 108
+.equ dz2, 112
+ /* set up the stack frame: all arguments are addressed through ebp */
+ /* and the epilogue uses leave, so ebp must be saved and loaded here */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 116 /* local stack space */
+ femms
+ /* move data to local stack */
+ movq mm0, [mm_two] /* mm_two is a constant defined outside this function */
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ punpckldq mm3,mm3 /* duplicate tabscale into both halves */
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i0300_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* can't use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear total potential and i forces */
+ pxor mm7,mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms (mov keeps the flags from sub) */
+ jge .i0300_unroll_loop
+ jmp .i0300_finish_inner
+.i0300_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0 /* rsq*invsqrt = r */
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4 /* save both table indices */
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3 /* 8 floats per table point (4 dispersion + 4 repulsion) */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* index for the second pair */
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i0300_finish_inner
+ jmp .i0300_unroll_loop
+.i0300_finish_inner:
+ /* innerk is -1 here when one j particle is left over, -2 when none */
+ and [esp + innerk], 1
+ jnz .i0300_single_inner
+ jmp .i0300_updateouterdata
+.i0300_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc] /* tabscale is applied to invsqrt here; the unrolled loop scales mm1 instead */
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i0300_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0300_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0300_outer
+.i0300_end:
+ femms
+ add esp, 116 /* release local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave /* restores esp/ebp saved by the prologue */
+ ret
+
+
+
+
+.globl inl0310_3dnow
+ .type inl0310_3dnow,@function
+inl0310_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ tabscale, 64
+.equ VFtab, 68
+.equ nsatoms, 72
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ vnbtot, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 48 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 56 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 64 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 72 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 80
+.equ innerjjnr0, 84
+.equ innerk0, 88
+.equ innerjjnr, 92
+.equ innerk, 96
+.equ fix, 100
+.equ fiy, 104
+.equ fiz, 108
+.equ dx1, 112
+.equ dy1, 116
+.equ dz1, 120
+.equ dx2, 124
+.equ dy2, 128
+.equ dz2, 132
+.equ nsvdwc, 136
+.equ nscoul, 140
+.equ nsvdw, 144
+.equ solnr, 148
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 152 /* local stack space */
+ femms
+ movq mm0, [mm_two]
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+
+ /* assume we have at least one i particle - start directly */
+.i0310_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vnbtot], mm7
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i0310_mno_vdwc
+ jmp .i0310_testvdw
+.i0310_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0310_unroll_vdwc_loop
+ jmp .i0310_finish_vdwc_inner
+.i0310_unroll_vdwc_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i0310_finish_vdwc_inner
+ jmp .i0310_unroll_vdwc_loop
+.i0310_finish_vdwc_inner:
+ and [esp + innerk], 1
+ jnz .i0310_single_vdwc_inner
+ jmp .i0310_updateouterdata_vdwc
+.i0310_single_vdwc_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i0310_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz .i0310_testvdw
+ jmp .i0310_mno_vdwc
+.i0310_testvdw:
+ mov ebx, [esp + nscoul]
+ add [esp + solnr], ebx
+
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i0310_mno_vdw
+ jmp .i0310_last_mno
+.i0310_mno_vdw:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0310_unroll_vdw_loop
+ jmp .i0310_finish_vdw_inner
+.i0310_unroll_vdw_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 3
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i0310_finish_vdw_inner
+ jmp .i0310_unroll_vdw_loop
+.i0310_finish_vdw_inner:
+ and [esp + innerk], 1
+ jnz .i0310_single_vdw_inner
+ jmp .i0310_updateouterdata_vdw
+.i0310_single_vdw_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 3
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i0310_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i0310_last_mno
+ jmp .i0310_mno_vdw
+
+.i0310_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0310_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0310_outer
+.i0310_end:
+ femms
+ add esp, 152
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl1000_3dnow /* 3DNow! inner loop: Coulomb (qq/r) energy and forces for nri i particles, each against its own list of j particles */
+ .type inl1000_3dnow,@function
+inl1000_3dnow:
+.equ nri, 8 /* nri..Vc: ebp-relative offsets of the stack arguments */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52 /* read straight from the stack as a float (see pfmul below) */
+.equ Vc, 56
+ /* stack offsets (relative to esp) for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8 /* ix,iy are always moved together with one movq */
+.equ iy, 12
+.equ iz, 16
+.equ iq, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 36
+.equ innerk, 40
+.equ fix, 44 /* fix,fiy are always moved together with one movq */
+.equ fiy, 48
+.equ fiz, 52
+.equ dx1, 56
+.equ dy1, 60
+.equ dz1, 64
+.equ dx2, 68
+.equ dy2, 72
+.equ dz2, 76
+ push ebp
+ mov ebp,esp
+ push eax /* save the six general registers used below; restored in reverse order at .i1000_end */
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 80 /* 80 bytes local stack space */
+ femms /* reset MMX/x87 state before using the mm registers */
+ /* assume we have at least one i particle - start directly (nri is only tested at the bottom of the outer loop) */
+.i1000_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* only the low product matters; punpckldq below overwrites the high half */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* can't use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fix], mm7 /* clears fix and fiy */
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4 /* advance pointer */
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos] /* esi = base of pos[] for the whole inner loop */
+ mov edi, [ebp + faction] /* edi = base of faction[] for the whole inner loop */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 2 /* sets the flags tested by the jge below */
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms - 2 (mov leaves the flags intact) */
+ jge .i1000_unroll_loop /* at least two j atoms: use the paired loop */
+ jmp .i1000_finish_inner
+.i1000_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ movd mm7, [ecx + ebx*4] /* charge[jnr2] */
+ punpckldq mm3,mm7 /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0 /* keep invsqrt in mm1 */
+ pfmul mm0,mm0 /* mm0 = invsqrt*invsqrt = invsq */
+ /* mm0 now contains invsq, and mm1 invsqrt
+ * do potential and fscal
+ */
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ pfmul mm3,mm1 /* mm3 has both vcoul (qq*invsqrt) */
+ pfmul mm0,mm3 /* mm0 has both fscal (vcoul*invsq) */
+
+ /* update vctot */
+
+ pfadd mm3, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm3 /* store the sum */
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0 /* mm0 = fscal of pair 1 in both halves */
+ punpckhdq mm1,mm1 /* mm1 = fscal of pair 2 in both halves */
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2 /* add force from pair 1 */
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4 /* add force from pair 2 */
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2 /* j atoms get the opposite force */
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i1000_finish_inner /* fewer than two j atoms remain */
+ jmp .i1000_unroll_loop
+.i1000_finish_inner:
+ and [esp + innerk], 1 /* innerk is -1 or -2 here (for a non-negative atom count); bit 0 set means one j atom is left */
+ jnz .i1000_single_inner
+ jmp .i1000_updateouterdata
+.i1000_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm6, [esp + iq] /* low half of iq is enough for one pair */
+ movd mm7, [ecx + eax*4]
+ pfmul mm6, mm7 /* mm6=qq */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm2, [esi + eax*4]
+ movd mm3, [esi + eax*4 + 8]
+ pfsub mm0, mm2 /* dr = ir - jr */
+ pfsub mm1, mm3
+ movq [esp + dx1], mm0
+ pfmul mm0,mm0
+ movd [esp + dz1], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfacc mm0, mm1 /* mm0=rsq */
+
+ pfrsqrt mm1,mm0 /* seed, refined by the N-R steps below */
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+ /* update vctot */
+ movq mm5, [esp + vctot]
+ pfadd mm5, mm6
+ movq [esp + vctot], mm5
+ /* spread fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dx1]
+ movd mm1, [esp + dz1]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+ /* update i particle force */
+ movq mm2, [esp + fix]
+ movd mm3, [esp + fiz]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fix], mm2
+ movd [esp + fiz], mm3
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax *4+ 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+ /* done! */
+.i1000_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz] /* 64-bit read also picks up dx1; only the low half is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix] /* the same i force is added to fshift[is3] */
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i1000_end /* ecx == 0: that was the last i particle */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the argument slot */
+ jmp .i1000_outer
+.i1000_end:
+ femms /* reset MMX/x87 state before returning */
+ add esp, 80 /* release the local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl1010_3dnow /* 3DNow! inner loop: Coulomb (qq/r) energy and forces; like a plain i/j loop but repeated for nscoul consecutive i atoms per outer entry */
+ .type inl1010_3dnow,@function
+inl1010_3dnow:
+.equ nri, 8 /* nri..nsatoms: ebp-relative offsets of the stack arguments */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52 /* read straight from the stack as a float (see pfmul below) */
+.equ Vc, 56
+.equ nsatoms, 60 /* pointer to ints, consumed 12 bytes per outer iteration */
+ /* stack offsets (relative to esp) for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8 /* shift vector is kept so it can be re-applied to every atom of the entry */
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ iq, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr0, 48 /* start of the j list, reloaded for every atom of the entry */
+.equ innerk0, 52 /* length of the j list, reloaded for every atom of the entry */
+.equ innerjjnr, 56
+.equ innerk, 60
+.equ fix, 64
+.equ fiy, 68
+.equ fiz, 72
+.equ dx1, 76
+.equ dy1, 80
+.equ dz1, 84
+.equ dx2, 88
+.equ dy2, 92
+.equ dz2, 96
+.equ nscoul, 100 /* remaining atoms to process in the current entry */
+.equ solnr, 104 /* index of the next atom to process in the current entry */
+ push ebp
+ mov ebp,esp
+ push eax /* save the six general registers used below; restored in reverse order at .i1010_end */
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 108 /* local stack space */
+ femms /* reset MMX/x87 state before using the mm registers */
+ /* assume we have at least one i particle - start directly (nri is only tested at the bottom of the outer loop) */
+ add [ebp + nsatoms], 8 /* skip two ints: the loop reads the third int of each 12-byte record - presumably the coulomb-only atom count, confirm with caller */
+
+.i1010_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0 /* stores shX and shY */
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ mov ecx, [eax] /* atom count for this entry */
+ add [ebp + nsatoms], 12 /* advance to the same field of the next record */
+ mov [esp + nscoul], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ mov [esp + solnr], ebx /* first atom of this entry is ii */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4 /* advance pointer */
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos] /* esi = base of pos[] for the inner loops */
+ mov edi, [ebp + faction] /* edi = base of faction[] for the inner loops */
+
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i1010_mno_coul
+ jmp .i1010_last_mno /* no atoms in this entry: only the Vc/gid bookkeeping is done */
+.i1010_mno_coul:
+ mov ebx, [esp + solnr] /* ebx = index of the atom handled in this pass */
+ inc dword ptr [esp + solnr] /* the next pass handles the following atom */
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* only the low product matters; punpckldq below overwrites the high half */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX] /* ix = shX + posX (and iy too) */
+ pfadd mm1, [esp + shZ] /* 64-bit read also picks up ix; only the low half is stored below */
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7 /* clears fix and fiy */
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0] /* restart at the beginning of the j list */
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2 /* sets the flags tested by the jge below */
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms - 2 (mov leaves the flags intact) */
+ jge .i1010_unroll_coul_loop /* at least two j atoms: use the paired loop */
+ jmp .i1010_finish_coul_inner
+.i1010_unroll_coul_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ movd mm7, [ecx + ebx*4] /* charge[jnr2] */
+ punpckldq mm3,mm7 /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0 /* keep invsqrt in mm1 */
+ pfmul mm0,mm0 /* mm0 = invsqrt*invsqrt = invsq */
+ /* mm0 now contains invsq, and mm1 invsqrt */
+ /* do potential and fscal */
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ pfmul mm3,mm1 /* mm3 has both vcoul (qq*invsqrt) */
+ pfmul mm0,mm3 /* mm0 has both fscal (vcoul*invsq) */
+
+ /* update vctot */
+
+ pfadd mm3, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm3 /* store the sum */
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0 /* mm0 = fscal of pair 1 in both halves */
+ punpckhdq mm1,mm1 /* mm1 = fscal of pair 2 in both halves */
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2 /* add force from pair 1 */
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4 /* add force from pair 2 */
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2 /* j atoms get the opposite force */
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i1010_finish_coul_inner /* fewer than two j atoms remain */
+ jmp .i1010_unroll_coul_loop
+.i1010_finish_coul_inner:
+ and [esp + innerk], 1 /* innerk is -1 or -2 here (for a non-negative atom count); bit 0 set means one j atom is left */
+ jnz .i1010_single_coul_inner
+ jmp .i1010_updateouterdata_coul
+.i1010_single_coul_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm6, [esp + iq] /* low half of iq is enough for one pair */
+ movd mm7, [ecx + eax*4]
+ pfmul mm6, mm7 /* mm6=qq */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm2, [esi + eax*4]
+ movd mm3, [esi + eax*4 + 8]
+ pfsub mm0, mm2 /* dr = ir - jr */
+ pfsub mm1, mm3
+ movq [esp + dx1], mm0
+ pfmul mm0,mm0
+ movd [esp + dz1], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfacc mm0, mm1 /* mm0=rsq */
+
+ pfrsqrt mm1,mm0 /* seed, refined by the N-R steps below */
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+ /* update vctot */
+ movq mm5, [esp + vctot]
+ pfadd mm5, mm6
+ movq [esp + vctot], mm5
+ /* spread fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dx1]
+ movd mm1, [esp + dz1]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+ /* update i particle force */
+ movq mm2, [esp + fix]
+ movd mm3, [esp + fiz]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fix], mm2
+ movd [esp + fiz], mm3
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax *4+ 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+ /* done! */
+.i1010_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz] /* 64-bit read also picks up dx1; only the low half is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix] /* the same i force is added to fshift[is3], once per atom of the entry */
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul]
+ jz .i1010_last_mno /* all atoms of this entry are done */
+ jmp .i1010_mno_coul
+.i1010_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i1010_end /* ecx == 0: that was the last outer entry */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the argument slot */
+ jmp .i1010_outer
+.i1010_end:
+ femms /* reset MMX/x87 state before returning */
+ add esp, 108 /* release the local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl1020_3dnow
+ .type inl1020_3dnow,@function
+inl1020_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20/* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28/* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36/* repeated (64bit) to fill 3dnow reg */
+.equ iqO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ iqH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 60/* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 68
+.equ innerk, 72
+.equ fixO, 76
+.equ fiyO, 80
+.equ fizO, 84
+.equ fixH, 88/* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 96/* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 104 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 112
+.equ dyO, 116
+.equ dzO, 120
+.equ dxH, 124 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 132 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 140 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 148 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel]
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] */
+ pfmul mm2, mm1
+ movq [esp + iqO], mm2 /* iqO = facel*charge[ii] */
+
+ movd mm2, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] */
+ pfmul mm2, mm1
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iqH], mm2 /* iqH = facel*charge[i.i0+1] */
+.i1020_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6 */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2 */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp storage for iz) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fixO], mm7
+ movd [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i1020_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add [esp + innerjjnr], 4 /* advance pointer */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge]
+ movd mm7, [ecx + eax*4]
+ punpckldq mm7,mm7
+ movq mm6,mm7
+ pfmul mm6, [esp + iqO]
+ pfmul mm7, [esp + iqH] /* mm6=qqO, mm7=qqH */
+
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i1020_updateouterdata
+ jmp .i1020_inner_loop
+.i1020_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i1020_end
+ /* not last, iterate once more! */
+ jmp .i1020_outer
+.i1020_end:
+ femms
+ add esp, 148
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+/*
+ * inl1030_3dnow - Coulomb-only inner loop between three-site molecules
+ * (sites are called O, H1 and H2 below), using 3DNow! instructions.
+ *
+ * All arguments are read from the caller's stack frame relative to ebp
+ * (see the first .equ group). For each of the nri outer iterations the
+ * routine
+ * - loads shift[n], iinr[n] and forms the shifted i coordinates,
+ * - walks the j indices jjnr[jindex[n]] .. jjnr[jindex[n+1]-1],
+ * - for every j molecule handles its three sites one after the other,
+ * each against the i O site (scalar, low half of the registers) and
+ * against the two i H sites (packed, H1 in the low and H2 in the high
+ * half),
+ * - subtracts the resulting forces from faction[] of the j sites and
+ * finally adds the accumulated i forces to faction[] and fshift[],
+ * and the summed potential to Vc[gid[n]].
+ *
+ * The pointer/counter arguments (nri, iinr, jindex, shift, gid) are
+ * advanced or decremented in place in the argument area.
+ * No loop count is checked before the first pass: nri must be >= 1 and
+ * every i entry needs at least one j entry, otherwise the dec/jz
+ * counters wrap around.
+ * eax, ebx, ecx, edx, esi and edi are saved and restored; the MMX/3DNow!
+ * state is cleared with femms on entry and on exit.
+ */
+.globl inl1030_3dnow
+ .type inl1030_3dnow,@function
+inl1030_3dnow:
+ /* offsets of the stack arguments relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20/* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28/* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36/* repeated (64bit) to fill 3dnow reg */
+.equ qqOO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqHH, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 68/* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 76
+.equ innerk, 80
+.equ fixO, 84
+.equ fiyO, 88
+.equ fizO, 92
+.equ fixH, 96/* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 104 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 112 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 120
+.equ dyO, 124
+.equ dzO, 128
+.equ dxH, 132 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 140 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 148 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 156 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ /* the charge products are computed once, from the first i molecule only */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel] /* mm1=facel */
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] (O) */
+ movd mm3, [edx + ebx*4 + 4] /* mm3=charge[i.i0+1] (H) */
+ movq mm4, mm2
+ pfmul mm4, mm1
+ movq mm6, mm3
+ pfmul mm6, mm1
+ movq mm5, mm4
+ pfmul mm4, mm2 /* mm4=qqOO*facel */
+ pfmul mm5, mm3 /* mm5=qqOH*facel */
+ pfmul mm6, mm3 /* mm6=qqHH*facel */
+ punpckldq mm5,mm5 /* spread to both halves */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + qqOO], mm4
+ movq [esp + qqOH], mm5
+ movq [esp + qqHH], mm6
+.i1030_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6 */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2 */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp storage for iz) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ /* 8-byte store: the upper half lands in the ixH slot, which is rewritten below */
+ movq [esp + izO], mm6
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fixO], mm7
+ /* 8-byte store: also touches the fixH slot, which is cleared anyway */
+ movq [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i1030_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer */
+
+ /* interactions with j O: qqOO for the i O site, qqOH for the two i H sites */
+ movd mm6, [esp + qqOO]
+ movq mm7, [esp + qqOH]
+
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO] /* only the low half (dz) is meaningful */
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm0
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ /* inverse square root of rsqO: table seed plus one refinement step */
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ /* pfrsqrt only looks at the low half, so seed H1 and H2 separately */
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* interactions with j H1 */
+ movq mm0, [esi + eax*4 + 12]
+ movd mm1, [esi + eax*4 + 20]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ /* qqOH for the i O site, qqHH for the two i H sites */
+ movd mm6, [esp + qqOH]
+ movq mm7, [esp + qqHH]
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1 /* low half = dx*dx+dy*dy; the high half is not used */
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 12]
+ movd mm3, [edi + eax*4 + 20]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 12], mm2
+ movd [edi + eax*4 + 20], mm3
+
+ /* interactions with j H2 */
+ movq mm0, [esi + eax*4 + 24]
+ movd mm1, [esi + eax*4 + 32]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ /* qqOH for the i O site, qqHH for the two i H sites */
+ movd mm6, [esp + qqOH]
+ movq mm7, [esp + qqHH]
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1 /* low half = dx*dx+dy*dy; the high half is not used */
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 24]
+ movd mm3, [edi + eax*4 + 32]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 24], mm2
+ movd [edi + eax*4 + 32], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i1030_updateouterdata
+ jmp .i1030_inner_loop
+.i1030_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO] /* only the low half is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ /* unpack the H accumulators from (H1,H2) per component to (x,y) per atom */
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ /* fshift[is3] receives the sum of the O, H1 and H2 forces */
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i1030_end
+ /* not last, iterate once more! */
+ jmp .i1030_outer
+.i1030_end:
+ femms
+ add esp, 156
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+/*
+ * inl1100_3dnow - Coulomb plus Lennard-Jones (c6/c12) inner loop for
+ * single-site i particles, using 3DNow! instructions.
+ *
+ * All arguments are read from the caller's stack frame relative to ebp
+ * (see the first .equ group). For each of the nri outer iterations the
+ * routine
+ * - loads shift[n] and iinr[n], forms the shifted i coordinates,
+ * iq = facel*charge[ii] and ntia = 2*ntype*type[ii],
+ * - walks the j indices jjnr[jindex[n]] .. jjnr[jindex[n+1]-1] two at
+ * a time (first j in the low, second j in the high register half),
+ * followed by a single-j pass when the count is odd,
+ * - reads c6/c12 for each pair from nbfp[ntia + 2*type[j]] and the
+ * following element,
+ * - subtracts the pair forces from faction[] of the j particles and
+ * finally adds the accumulated i force to faction[] and fshift[],
+ * and the summed potentials to Vc[gid[n]] and Vnb[gid[n]].
+ *
+ * The pointer/counter arguments (nri, iinr, jindex, shift, gid) are
+ * advanced or decremented in place in the argument area.
+ * An empty j list is handled; nri itself is assumed to be >= 1.
+ * mm_six and mm_twelve are 64-bit constants defined outside this
+ * routine. eax, ebx, ecx, edx, esi and edi are saved and restored; the
+ * MMX/3DNow! state is cleared with femms on entry and on exit.
+ */
+.globl inl1100_3dnow
+ .type inl1100_3dnow,@function
+inl1100_3dnow:
+ /* offsets of the stack arguments relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8
+.equ iy, 12
+.equ iz, 16
+.equ iq, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 76
+.equ innerjjnr, 80
+.equ innerk, 84
+.equ fix, 88
+.equ fiy, 92
+.equ fiz, 96
+.equ dx1, 100
+.equ dy1, 104
+.equ dz1, 108
+.equ dx2, 112
+.equ dy2, 116
+.equ dz2, 120
+ push ebp
+ mov ebp,esp
+
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 124 /* local stack space */
+ femms
+ /* move data to local stack */
+ movq mm0, [mm_six]
+ movq mm1, [mm_twelve]
+ movq [esp + six], mm0
+ movq [esp + twelve], mm1
+ /* assume we have at least one i particle - start directly */
+.i1100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ /* 64-bit read of the facel slot; the high product is discarded by the spread below */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear total potential and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ /* innerk holds (remaining j count - 2); non-negative means a pair is left */
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i1100_unroll_loop
+ jmp .i1100_finish_inner
+.i1100_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0
+ pfmul mm0,mm0
+ /* mm0 now contains invsq, and mm1 invsqrt */
+ /* do potential and fscal */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm3, mm1 /* mm3 has vcoul for both interactions */
+ movq mm7, mm3 /* use mm7 for sum to make fscal */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ /* fscal = (vcoul + 12*vnb12 - 6*vnb6) * invsq */
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm7,mm4
+ pfadd mm7, mm5
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch the stored dr slots on the stack */
+
+ /* update vctot */
+ pfadd mm3, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm3 /* store the sum */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 2
+ jl .i1100_finish_inner
+ jmp .i1100_unroll_loop
+.i1100_finish_inner:
+ /* innerk is -1 when one j particle is left over and -2 (or 0 after masking) when none is */
+ and dword ptr [esp + innerk], 1
+ jnz .i1100_single_inner
+ jmp .i1100_updateouterdata
+.i1100_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq (low half) */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ movq mm1, mm0 /* keep invsqrt in mm1 */
+ pfmul mm0, mm0 /* mm0=invsq */
+ /* calculate potentials and scalar force */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm3, mm1 /* mm3 has vcoul (low half only) */
+ movq mm7, mm3 /* use mm7 for sum to make fscal */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm7,mm4
+ pfadd mm7, mm5
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ /* update vctot */
+ pfadd mm3, [esp + vctot]
+ movq [esp + vctot], mm3
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i1100_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz] /* only the low half is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i1100_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i1100_outer
+.i1100_end:
+ femms
+ add esp, 124
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+.globl inl1110_3dnow
+ .type inl1110_3dnow,@function
+inl1110_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ nsatoms, 76
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ iq, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 48 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 56 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 64 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 72 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 80 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 88
+.equ innerjjnr0, 92
+.equ innerk0, 96
+.equ innerjjnr, 100
+.equ innerk, 104
+.equ fix, 108
+.equ fiy, 112
+.equ fiz, 116
+.equ dx1, 120
+.equ dy1, 124
+.equ dz1, 128
+.equ dx2, 132
+.equ dy2, 136
+.equ dz2, 140
+.equ nsvdwc, 144
+.equ nscoul, 148
+.equ nsvdw, 152
+.equ solnr, 156
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 160 /* local stack space */
+ femms
+ movq mm0, [mm_six]
+ movq mm1, [mm_twelve]
+ movq [esp + six], mm0
+ movq [esp + twelve], mm1
+ /* assume we have at least one i particle - start directly */
+.i1110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i1110_mno_vdwc
+ jmp .i1110_testcoul
+.i1110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i1110_unroll_vdwc_loop
+ jmp .i1110_finish_vdwc_inner
+.i1110_unroll_vdwc_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0
+ pfmul mm0,mm0
+ /* mm0 now contains invsq, and mm1 invsqrt */
+ /* do potential and fscal */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm3, mm1 /* mm3 has vcoul for both interactions */
+ movq mm7, mm3 /* use mm7 for sum to make fscal */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm7,mm4
+ pfadd mm7, mm5
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* update vctot */
+ pfadd mm3, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm3 /* store the sum */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i1110_finish_vdwc_inner
+ jmp .i1110_unroll_vdwc_loop
+.i1110_finish_vdwc_inner:
+ and [esp + innerk], 1
+ jnz .i1110_single_vdwc_inner
+ jmp .i1110_updateouterdata_vdwc
+.i1110_single_vdwc_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ movq mm1, mm0
+ pfmul mm0, mm0 /* mm0=invsq */
+ /* calculate potentials and scalar force */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm3, mm1 /* mm3 has vcoul for both interactions */
+ movq mm7, mm3 /* use mm7 for sum to make fscal */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm7,mm4
+ pfadd mm7, mm5
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ /* update vctot */
+ pfadd mm3, [esp + vctot]
+ movq [esp + vctot], mm3
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i1110_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz .i1110_testcoul
+ jmp .i1110_mno_vdwc
+.i1110_testcoul:
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i1110_mno_coul
+ jmp .i1110_testvdw
+.i1110_mno_coul:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i1110_unroll_coul_loop
+ jmp .i1110_finish_coul_inner
+.i1110_unroll_coul_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ movd mm7, [ecx + ebx*4] /* charge[jnr2] */
+ punpckldq mm3,mm7 /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0
+ pfmul mm0,mm0
+ /* mm0 now contains invsq, and mm1 invsqrt */
+ /* do potential and fscal */
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ pfmul mm3,mm1 /* 6 has both vcoul */
+ pfmul mm0,mm3 /* 0 has both fscal */
+
+ /* update vctot */
+
+ pfadd mm3, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm3 /* store the sum */
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i1110_finish_coul_inner
+ jmp .i1110_unroll_coul_loop
+.i1110_finish_coul_inner:
+ and [esp + innerk], 1
+ jnz .i1110_single_coul_inner
+ jmp .i1110_updateouterdata_coul
+.i1110_single_coul_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm6, [esp + iq]
+ movd mm7, [ecx + eax*4]
+ pfmul mm6, mm7 /* mm6=qq */
+
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm2, [esi + eax*4]
+ movd mm3, [esi + eax*4 + 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [esp + dx1], mm0
+ pfmul mm0,mm0
+ movd [esp + dz1], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfacc mm0, mm1 /* mm0=rsq */
+
+ pfrsqrt mm1,mm0
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+ /* update vctot */
+ movq mm5, [esp + vctot]
+ pfadd mm5, mm6
+ movq [esp + vctot], mm5
+ /* spread fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dx1]
+ movd mm1, [esp + dz1]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+ /* update i particle force */
+ movq mm2, [esp + fix]
+ movd mm3, [esp + fiz]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fix], mm2
+ movd [esp + fiz], mm3
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax *4+ 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+ /* done! */
+.i1110_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul]
+ jz .i1110_testvdw
+ jmp .i1110_mno_coul
+.i1110_testvdw:
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i1110_mno_vdw
+ jmp .i1110_last_mno
+.i1110_mno_vdw:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i1110_unroll_vdw_loop
+ jmp .i1110_finish_vdw_inner
+.i1110_unroll_vdw_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ movq mm1,mm0
+ pfmul mm0,mm0
+ /* mm0 now contains invsq, and mm1 invsqrt */
+ /* do potential and fscal */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ movq mm7, mm5
+ pfsub mm7,mm4
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i1110_finish_vdw_inner
+ jmp .i1110_unroll_vdw_loop
+.i1110_finish_vdw_inner:
+ and [esp + innerk], 1
+ jnz .i1110_single_vdw_inner
+ jmp .i1110_updateouterdata_vdw
+.i1110_single_vdw_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ movq mm1, mm0
+ pfmul mm0, mm0 /* mm0=invsq */
+ /* calculate potentials and scalar force */
+ movq mm4, mm0
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ movq mm7, mm5
+ pfsub mm7,mm4
+ pfmul mm0, mm7 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i1110_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i1110_last_mno
+ jmp .i1110_mno_vdw
+
+.i1110_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i1110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i1110_outer
+.i1110_end:
+ femms
+ add esp, 160
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl1120_3dnow
+ .type inl1120_3dnow,@function
+inl1120_3dnow:
+ /* Nonbonded inner loop for a three-atom i molecule (called O, H1, H2 below) */
+ /* against single j atoms, using AMD 3DNow! (pswapd is from the extended set). */
+ /* Coulomb is evaluated for all three i atoms; Lennard-Jones (c6/c12 read from */
+ /* nbfp) only for the first (O) atom. Results are accumulated into faction[], */
+ /* fshift[], Vc[gid] and Vnb[gid]. */
+ /* Arguments are read from the caller's stack at the ebp offsets listed below; */
+ /* the pointer slots (iinr, jindex, shift, gid) and nri are advanced/decremented */
+ /* in place while iterating. */
+ /* iqO, iqH and ntia are computed once, from the first entry of iinr[], and then */
+ /* reused for every i molecule in the list. */
+ /* charge[ii] is used for O and charge[ii+1] for both H atoms. */
+ /* All general purpose registers that are used get saved and restored; the */
+ /* MMX/3DNow state is cleared with femms on entry and exit. */
+ /* Requires nri >= 1 (the outer loop is tested at the bottom). */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ iqO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ iqH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 108 /* integer index offset into nbfp (only the first 4 bytes of the slot are used) */
+.equ innerjjnr, 116
+.equ innerk, 120
+.equ fixO, 124
+.equ fiyO, 128
+.equ fizO, 132
+.equ fixH, 136 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 144 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 152 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 160
+.equ dyO, 164
+.equ dzO, 168
+.equ dxH, 172 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 180 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 188 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 196 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel]
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] */
+ pfmul mm2, mm1
+ movq [esp + iqO], mm2 /* iqO = facel*charge[ii] (low half only, high half is zero) */
+
+ movd mm2, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] */
+ pfmul mm2, mm1
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iqH], mm2 /* iqH = facel*charge[i.i0+1] */
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[i.i0] */
+ mov [esp + ntia], ecx
+
+ movq mm3, [mm_six] /* keep local copies of the constants mm_six/mm_twelve */
+ movq mm4, [mm_twelve]
+ movq [esp + six], mm3
+ movq [esp + twelve], mm4
+.i1120_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* 8-byte store: the upper half lands in ixH, which is rewritten below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7
+ movd [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ /* the inner loop is tested at the bottom (dec/jz), so an empty j list must be */
+ /* skipped here - otherwise the counter would wrap around instead of hitting zero. */
+ cmp edx, 0
+ jle .i1120_updateouterdata
+.i1120_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov ecx, [esp + innerjjnr] /* ecx = pointer to jjnr[k] (needed for the prefetch below) */
+ mov eax, [ecx] /* eax=jnr offset */
+ add [esp + innerjjnr], 4 /* advance pointer */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge]
+ movd mm7, [ecx + eax*4]
+ punpckldq mm7,mm7
+ movq mm6,mm7
+ pfmul mm6, [esp + iqO]
+ pfmul mm7, [esp + iqH] /* mm6=qqO, mm7=qqH */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr] */
+ mov ecx, [ebp + nbfp]
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [ecx + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [ecx + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4 /* mm2-mm4 is jx-jz, each in both halves */
+
+ pfsubr mm0, [esp + ixO] /* dr = ir - jr for the oxygen */
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO (low half) */
+
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0 /* lookup inverse square root seed for the oxygen */
+
+ movq mm2,mm1 /* amd 3dnow N-R iteration to get full precision */
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+
+ movq mm0, mm4
+ pfmul mm0, mm4
+ pfmul mm0, mm4 /* mm0=rinvsix */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm2=rinvtwelve */
+
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ movq mm1, mm6 /* use mm1 for fscal sum */
+
+ /* LJ for the oxygen */
+ pfmul mm0, [esp + c6]
+ pfmul mm2, [esp + c12]
+
+ /* calc nb potential */
+ movq mm5, mm2
+ pfsub mm5, mm0
+
+ /* calc nb force */
+ pfmul mm0, [esp + six]
+ pfmul mm2, [esp + twelve]
+
+ /* increment scalar force */
+ pfsub mm1, mm0
+ pfadd mm1, mm2
+ pfmul mm4, mm1 /* total scalar force on oxygen. */
+
+ /* update nb potential */
+ pfadd mm5, [esp + vnbtot]
+ movq [esp + vnbtot], mm5
+
+ /* pfrsqrt only looks at the low half of its source, so swap to get the H2 seed */
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3. */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's. */
+
+ /* update vctot */
+ pfadd mm7, mm6 /* add the oxygen vcoul to the H ones; halves are summed at the end */
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i1120_updateouterdata
+ jmp .i1120_inner_loop
+.i1120_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ /* the H forces are stored per component (H1 low, H2 high) - repack them per atom */
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ /* fshift gets the sum of the O, H1 and H2 forces */
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* same for Vnb */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i1120_end
+ /* not last, iterate once more! */
+ jmp .i1120_outer
+.i1120_end:
+ femms
+ add esp, 196
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+/*
+ * inl1130_3dnow
+ *
+ * 3DNow! nonbonded kernel for pairs of three-site molecules (the sites are
+ * called O, H1 and H2 in the comments below).  For every i molecule listed
+ * in iinr[] it walks the j molecules in jjnr[jindex[n] .. jindex[n+1]-1]
+ * and evaluates all nine site-site interactions:
+ *   - plain Coulomb, vcoul = qq/r, for every site pair;
+ *   - Lennard-Jones, vnb = c12/r^12 - c6/r^6, for the O-O pair only.
+ * Forces are accumulated into faction[] (i and j sites) and fshift[];
+ * the potentials are added to Vc[gid] and Vnb[gid].
+ *
+ * Arguments are read from the stack at the ebp offsets given by the first
+ * .equ list.  The argument slots shift, iinr, jindex, gid and nri are
+ * modified in place while iterating.  eax, ebx, ecx, edx, esi and edi are
+ * saved on entry and restored on exit; femms is issued on entry and exit.
+ *
+ * The code assumes nri >= 1 and at least one j entry per i entry: both
+ * loops are bottom-tested with dec/jz.
+ * It reads the 64-bit constants mm_six and mm_twelve defined elsewhere.
+ */
+.globl inl1130_3dnow
+ .type inl1130_3dnow,@function
+inl1130_3dnow:
+ /* ebp offsets of the stack arguments */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqHH, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 108 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 116
+.equ innerk, 120
+.equ fixO, 124
+.equ fiyO, 128
+.equ fizO, 132
+.equ fixH, 136 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 144 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 152 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 160
+.equ dyO, 164
+.equ dzO, 168
+.equ dxH, 172 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 180 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 188 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 196 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ /* The charge products and LJ parameters are computed once, from the first entry of iinr[]. */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel] /* mm1=facel */
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] (O) */
+ movd mm3, [edx + ebx*4 + 4] /* mm3=charge[ii+1] (H) */
+ movq mm4, mm2
+ pfmul mm4, mm1
+ movq mm6, mm3
+ pfmul mm6, mm1
+ movq mm5, mm4
+ pfmul mm4, mm2 /* mm4=qqOO*facel */
+ pfmul mm5, mm3 /* mm5=qqOH*facel */
+ pfmul mm6, mm3 /* mm6=qqHH*facel */
+ punpckldq mm5,mm5 /* spread to both halves */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + qqOO], mm4
+ movq [esp + qqOH], mm5
+ movq [esp + qqHH], mm6
+ /* edx = 2*(type[ii]*ntype + type[ii]): index of the c6/c12 pair in nbfp[] */
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ mov edx, ecx
+ imul ecx, [ebp + ntype]
+ add edx, ecx
+ mov eax, [ebp + nbfp]
+ movd mm0, [eax + edx*4]
+ movd mm1, [eax + edx*4 + 4]
+ movq [esp + c6], mm0 /* only the low half is filled; LJ is used for O-O only */
+ movq [esp + c12], mm1
+ movq mm2, [mm_six]
+ movq mm3, [mm_twelve]
+ movq [esp + six], mm2
+ movq [esp + twelve], mm3
+.i1130_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* 8-byte store: upper half lands in ixH, which is written below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot, vnbtot and i forces (the fixO store also covers fiyO) */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7
+ movq [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i1130_inner_loop:
+ /* One j molecule per iteration.  Its three sites (j O, j H1, j H2) are */
+ /* handled in turn; each interacts with i O (scalar, low half of the */
+ /* register) and with i H1/H2 (packed in the low/high halves). */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer */
+
+ /* interactions with j O */
+ movd mm6, [esp + qqOO]
+ movq mm7, [esp + qqOH]
+
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm0 /* low half = dx*dx+dy*dy */
+ pfadd mm0, mm1 /* mm0=rsqO (low half) */
+
+ punpckldq mm2, mm2 /* already spread above; these three leave the values unchanged */
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ /* reciprocal square root of rsqO: seed + Newton-Raphson refinement */
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt (OO) */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq (OO) */
+
+ /* Lennard-Jones for the O-O pair */
+ movq mm2, mm4
+ pfmul mm2, mm4
+ pfmul mm2, mm4 /* mm2=rinvsix */
+ movq mm0, mm2
+ pfmul mm0,mm0 /* mm0=rinvtwelve */
+ pfmul mm2, [esp + c6]
+ pfmul mm0, [esp + c12]
+ movq mm5, mm0
+ pfsub mm5, mm2 /* vnb */
+
+ pfmul mm2, [esp + six]
+ pfmul mm0, [esp + twelve]
+
+ pfsub mm0, mm2 /* twelve*vnb12 - six*vnb6 */
+
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfadd mm0, mm6
+ pfmul mm4, mm0 /* mm4=fscalar */
+
+ /* update nb potential */
+ pfadd mm5, [esp + vnbtot]
+ movq [esp + vnbtot], mm5
+
+ /* invsqrt for the two H distances: one seed per half */
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's. */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* interactions with j H1 */
+ movq mm0, [esi + eax*4 + 12]
+ movd mm1, [esi + eax*4 + 20]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ movd mm6, [esp + qqOH]
+ movq mm7, [esp + qqHH]
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force (Coulomb only for pairs involving H) */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3 */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's. */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 12]
+ movd mm3, [edi + eax*4 + 20]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 12], mm2
+ movd [edi + eax*4 + 20], mm3
+
+ /* interactions with j H2 */
+ movq mm0, [esi + eax*4 + 24]
+ movd mm1, [esi + eax*4 + 32]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ movd mm6, [esp + qqOH]
+ movq mm7, [esp + qqHH]
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ movq mm4, mm1
+ pfmul mm4, mm4 /* mm4=invsq */
+ /* calculate potential and scalar force */
+ pfmul mm6, mm1 /* mm6=vcoul */
+ pfmul mm4, mm6 /* mm4=fscalar */
+
+ pfrsqrt mm5, mm3
+ pswapd mm3,mm3
+ pfrsqrt mm2, mm3
+ pswapd mm3,mm3
+ punpckldq mm5,mm2 /* seeds are in mm5 now, and rsq in mm3. */
+
+ movq mm2, mm5
+ pfmul mm5,mm5
+ pfrsqit1 mm5,mm3
+ pfrcpit2 mm5,mm2 /* mm5=invsqrt */
+ movq mm3,mm5
+ pfmul mm3,mm3 /* mm3=invsq */
+ pfmul mm7, mm5 /* mm7=vcoul */
+ pfmul mm3, mm7 /* mm3=fscal for the two H's. */
+
+ /* update vctot */
+ pfadd mm7, mm6
+ pfadd mm7, [esp + vctot]
+ movq [esp + vctot], mm7
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm4,mm4
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm4
+ pfmul mm1, mm4
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm3
+ pfmul mm6, mm3
+ pfmul mm7, mm3
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 24]
+ movd mm3, [edi + eax*4 + 32]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 24], mm2
+ movd [edi + eax*4 + 32], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i1130_updateouterdata
+ jmp .i1130_inner_loop
+.i1130_updateouterdata:
+ /* add the accumulated i forces to faction[] and fshift[], and the potentials to Vc[]/Vnb[] */
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ /* unpack the H forces from (H1,H2) pairs per component to (x,y)/z per atom */
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* same for Vnb */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i1130_end
+ /* not last, iterate once more! */
+ jmp .i1130_outer
+.i1130_end:
+ femms
+ add esp, 196
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+/*
+ * inl3000_3dnow
+ *
+ * 3DNow! nonbonded kernel with tabulated Coulomb interaction only.
+ * For every i particle listed in iinr[] it walks the j particles in
+ * jjnr[jindex[n] .. jindex[n+1]-1], two at a time (one per register half)
+ * with a single-particle tail when the count is odd.
+ *
+ * For each pair: r = rsq*invsqrt, rt = r*tabscale, n0 = integer part of rt,
+ * eps = rt - n0.  Four consecutive floats (Y, F, G, H) are read from
+ * VFtab at index 4*n0 and combined as
+ *     Fp = F + G*eps + H*eps^2
+ *     VV = Y + eps*Fp                 vcoul = qq*VV
+ *     FF = Fp + G*eps + 2*H*eps^2     fscal = -qq*FF*tabscale*invsqrt
+ * with qq = facel*charge[ii]*charge[jnr].
+ * Forces are accumulated into faction[] (i and j) and fshift[]; the
+ * potential is added to Vc[gid].
+ *
+ * Arguments are read from the stack at the ebp offsets given by the first
+ * .equ list.  The argument slots shift, iinr, jindex, gid and nri are
+ * modified in place while iterating.  eax, ebx, ecx, edx, esi and edi are
+ * saved on entry and restored on exit; femms is issued on entry and exit.
+ * The code assumes nri >= 1 and reads the 64-bit constant mm_two defined
+ * elsewhere.
+ */
+.globl inl3000_3dnow
+ .type inl3000_3dnow,@function
+inl3000_3dnow:
+ /* ebp offsets of the stack arguments */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8
+.equ iy, 12
+.equ iz, 16
+.equ iq, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 60
+.equ innerjjnr, 64
+.equ innerk, 68
+.equ fix, 72
+.equ fiy, 76
+.equ fiz, 80
+.equ dx1, 84
+.equ dy1, 88
+.equ dz1, 92
+.equ dx2, 96
+.equ dy2, 100
+.equ dz2, 104
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 108 /* local stack space */
+ femms
+ /* move data to local stack */
+ movq mm0, [mm_two]
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ punpckldq mm3,mm3 /* tabscale in both halves */
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i3000_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* 64-bit operand; only the low half of the product is kept */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear total potential and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 2 /* the flags of this sub drive the jge below (mov leaves them intact) */
+ mov [esp + innerk], edx /* number of innerloop atoms, minus 2 */
+ jge .i3000_unroll_loop
+ jmp .i3000_finish_inner
+.i3000_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0 /* r = rsq*invsqrt */
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4 /* integer table indices for both pairs */
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2 /* four floats per table point */
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* index for the second pair */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC. */
+ /* increment vcoul - then we can get rid of mm5. */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch the stored dr values on the local stack */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 2
+ jl .i3000_finish_inner
+ jmp .i3000_unroll_loop
+.i3000_finish_inner:
+ /* innerk is negative here; its low bit is set when one j particle is left over */
+ and dword ptr [esp + innerk], 1
+ jnz .i3000_single_inner
+ jmp .i3000_updateouterdata
+.i3000_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3000_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3000_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3000_outer
+.i3000_end:
+ femms
+ add esp, 108
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3010_3dnow
+ .type inl3010_3dnow,@function
+inl3010_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+.equ nsatoms, 68
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ iq, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 48 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 56 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 64 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr0, 72
+.equ innerk0, 76
+.equ innerjjnr, 80
+.equ innerk, 84
+.equ fix, 88
+.equ fiy, 92
+.equ fiz, 96
+.equ dx1, 100
+.equ dy1, 104
+.equ dz1, 108
+.equ dx2, 112
+.equ dy2, 116
+.equ dz2, 120
+.equ nscoul, 124
+.equ solnr, 128
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 132 /* local stack space */
+ femms
+
+ add [ebp + nsatoms], 8
+ movq mm2, [mm_two]
+ movq [esp + two], mm2
+ movd mm3, [ebp + tabscale]
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+
+ /* assume we have at least one i particle - start directly */
+.i3010_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ mov ecx, [eax]
+ add [ebp + nsatoms], 12
+ mov [esp + nscoul], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3010_mno_coul
+ jmp .i3010_last_mno
+.i3010_mno_coul:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3010_unroll_coul_loop
+ jmp .i3010_finish_coul_inner
+.i3010_unroll_coul_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3010_finish_coul_inner
+ jmp .i3010_unroll_coul_loop
+.i3010_finish_coul_inner:
+ and [esp + innerk], 1
+ jnz .i3010_single_coul_inner
+ jmp .i3010_updateouterdata_coul
+.i3010_single_coul_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3010_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul]
+ jz .i3010_last_mno
+ jmp .i3010_mno_coul
+.i3010_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3010_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3010_outer
+.i3010_end:
+ femms
+ add esp, 132
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+/*
+ * inl3020_3dnow - Coulomb inner loop using a tabulated potential (VFtab),
+ * written with 3DNow! instructions (plus pswapd).
+ *
+ * Every i entry is handled as three consecutive atoms - labelled O, H1 and H2
+ * in the comments below - while every j entry is a single atom.
+ *
+ * Arguments are read from the caller's stack frame at the [ebp + ...] offsets
+ * defined below. eax-edx, esi and edi are pushed on entry and popped on exit,
+ * and femms is issued both on entry and right before returning.
+ *
+ * Both loops test their counter after the body (dec + jz), so the code relies
+ * on nri >= 1 and on every i entry having at least one j neighbour.
+ */
+.globl inl3020_3dnow
+ .type inl3020_3dnow,@function
+inl3020_3dnow:
+ /* argument offsets relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ iqO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ iqH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqO, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ qqH, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 108
+.equ innerk, 112
+.equ fixO, 116
+.equ fiyO, 120
+.equ fizO, 124
+.equ fixH, 128 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 136 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 144 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 152
+.equ dyO, 156
+.equ dzO, 160
+.equ dxH, 164 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 172 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 180 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 188 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 196 /* local stack space */
+ femms
+
+ /* the i charges are taken from the first i entry only and reused for all of them */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel]
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] */
+ pfmul mm2, mm1
+ movq [esp + iqO], mm2 /* iqO = facel*charge[ii] (upper half is zero) */
+
+ movd mm2, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] */
+ pfmul mm2, mm1
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iqH], mm2 /* iqH = facel*charge[i.i0+1] */
+
+ movq mm3, [mm_two]
+ movd mm4, [ebp + tabscale]
+ punpckldq mm4,mm4 /* spread to both halves */
+ movq [esp + two], mm3
+ movq [esp + tsc], mm4
+ /* assume we have at least one i particle - start directly */
+.i3020_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* 64-bit store also touches ixH, which is written below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fixO], mm7
+ movd [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3020_inner_loop:
+ /* a single j particle iteration */
+ mov ecx, [esp + innerjjnr] /* ecx = pointer to jjnr[k] */
+ mov eax, [ecx] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer */
+ prefetch [ecx + 16] /* prefetch upcoming jjnr entries (ecx is reloaded right below) */
+
+ mov ecx, [ebp + charge]
+ movd mm7, [ecx + eax*4]
+ punpckldq mm7,mm7
+ movq mm6,mm7
+ pfmul mm6, [esp + iqO]
+ pfmul mm7, [esp + iqH] /* mm6=qqO, mm7=qqH */
+ movq [esp + qqO], mm6 /* full 64-bit store: qqO is read back as a 64-bit operand */
+ movq [esp + qqH], mm7
+
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO (low half) */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0, mm1 /* mm0=r */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqO] /* fijC=qq*FF */
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ movq mm3, mm7
+
+ /* change sign of fscal and multiply with rinv */
+ pxor mm0,mm0
+ pfsubr mm3, mm0
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+ /* now do the two hydrogens. */
+
+ movq mm0, [esp + tmprsqH] /* mm0=rsqH */
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqH] /* fijC=qq*FF */
+
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 + 8], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i3020_updateouterdata
+ jmp .i3020_inner_loop
+.i3020_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO] /* only the low half is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3, mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3020_end
+ /* not last, iterate once more! */
+ jmp .i3020_outer
+.i3020_end:
+ femms
+ add esp, 196
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3030_3dnow
+ .type inl3030_3dnow,@function
+inl3030_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqHH, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 100
+.equ innerk, 104
+.equ fixO, 108
+.equ fiyO, 112
+.equ fizO, 116
+.equ fixH, 120 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 128 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 136 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 144
+.equ dyO, 148
+.equ dzO, 152
+.equ dxH, 156 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 164 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 172 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 180 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 188 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel] /* mm1=facel */
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] (O) */
+ movd mm3, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] (H) */
+ movq mm4, mm2
+ pfmul mm4, mm1
+ movq mm6, mm3
+ pfmul mm6, mm1
+ movq mm5, mm4
+ pfmul mm4, mm2 /* mm4=qqOO*facel */
+ pfmul mm5, mm3 /* mm5=qqOH*facel */
+ pfmul mm6, mm3 /* mm6=qqHH*facel */
+ punpckldq mm5,mm5 /* spread to both halves */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + qqOO], mm4
+ movq [esp + qqOH], mm5
+ movq [esp + qqHH], mm6
+ movq mm2, [mm_two]
+ movq [esp + two], mm2
+ movd mm3, [ebp + tabscale]
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+.i3030_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + fixO], mm7
+ movq [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3030_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add [esp + innerjjnr], 4 /* advance pointer */
+
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm0
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt OO */
+ pfmul mm0, mm1 /* mm0=r OO */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ movq mm3, mm7
+
+ /* change sign of fscal and multiply with rinv */
+ pxor mm0,mm0
+ pfsubr mm3, mm0
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+ /* time for hydrogens! */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* interactions with j H1 */
+
+ movq mm0, [esi + eax*4 + 12]
+ movd mm1, [esi + eax*4 + 20]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1 /* mm0=rsq */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, force is moved to mm3 */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3, mm3
+ pfsub mm3, mm7
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 12]
+ movd mm3, [edi + eax*4 + 20]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 12], mm2
+ movd [edi + eax*4 + 20], mm3
+
+ /* interactions with j H2 */
+ movq mm0, [esi + eax*4 + 24]
+ movd mm1, [esi + eax*4 + 32]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3,mm3
+ pfsub mm3, mm7
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 24]
+ movd mm3, [edi + eax*4 + 32]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 24], mm2
+ movd [edi + eax*4 + 32], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i3030_updateouterdata
+ jmp .i3030_inner_loop
+.i3030_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3030_end
+ /* not last, iterate once more! */
+ jmp .i3030_outer
+.i3030_end:
+ femms
+ add esp, 188
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3100_3dnow
+ .type inl3100_3dnow,@function
+inl3100_3dnow:
+ /*
+  * inl3100_3dnow: 3DNow! nonbonded inner loop combining a tabulated
+  * Coulomb interaction (cubic-spline lookup in VFtab) with an analytical
+  * Lennard-Jones (c6/c12) interaction.
+  *
+  * For each of the nri i-particles the routine walks its j-neighbours,
+  * two at a time in the unrolled loop (one per 3DNow! lane) plus an
+  * optional single leftover, and accumulates:
+  *   - forces on the i and j particles into faction[],
+  *   - the i-particle force into fshift[3*shift[n]],
+  *   - the Coulomb energy into Vc[gid[n]] and the LJ energy into Vnb[gid[n]].
+  *
+  * Arguments are read relative to ebp (offsets defined just below).  The
+  * array pointers shift, iinr, jindex and gid, and the counter nri, are
+  * advanced/decremented IN PLACE in the argument slots as the outer loop
+  * iterates.  The routine assumes nri >= 1.  eax-edi are saved and
+  * restored; the MMX/3DNow! state is cleared with femms on entry and exit.
+  *
+  * NOTE: the .equ names below are plain assembler symbols that other
+  * routines in this file reassign with different values, so they are only
+  * meaningful up to the next routine's own definitions.
+  */
+ /* argument offsets relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8
+.equ iy, 12
+.equ iz, 16
+.equ iq, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 100
+.equ innerjjnr, 104
+.equ innerk, 108
+.equ fix, 112
+.equ fiy, 116
+.equ fiz, 120
+.equ dx1, 124
+.equ dy1, 128
+.equ dz1, 132
+.equ dx2, 136
+.equ dy2, 140
+.equ dz2, 144
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 148 /* local stack space (last local dz2 ends at 144+4) */
+ femms
+ /* move data to local stack */
+ movq mm0, [mm_two]
+ movq mm1, [mm_six]
+ movq mm2, [mm_twelve]
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ movq [esp + six], mm1
+ movq [esp + twelve], mm2
+ punpckldq mm3,mm3 /* spread tabscale to both halves */
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i3100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* low half = facel*charge[ii]; high half is overwritten below */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* can't use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear total potential and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fix], mm7 /* clears fix and fiy */
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 2
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms - 2 (mov leaves the flags from sub intact) */
+ jge .i3100_unroll_loop /* at least two atoms: do a paired iteration */
+ jmp .i3100_finish_inner
+.i3100_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0 /* rsq*invsqrt = r */
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4 /* store both integer table indices (n1 and n1+4) */
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2 /* 4 floats per table point: element index = 4*n0 */
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* table index for the second pair */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ movq mm1, mm0
+ pfmul mm1,mm1 /* mm1=invsq */
+ movq mm2, mm1
+ pfmul mm2,mm1
+ pfmul mm2,mm1 /* mm2=rinvsix */
+ movq mm1,mm2
+ pfmul mm1,mm1 /* mm1=rinvtwelve */
+
+ pfmul mm3, [esp + tsc] /* mm3 = fijC*tabscale */
+
+ pfmul mm1, [esp + c12] /* mm1 = vnb12 */
+
+ pfmul mm2, [esp + c6] /* mm2 = vnb6 */
+
+ movq mm4, mm1
+ pfsub mm4, mm2 /* mm4 = vnb12-vnb6 */
+
+ pfmul mm2, [esp + six]
+ pfmul mm1, [esp + twelve]
+
+ pfsub mm1, mm2
+ pfmul mm1, mm0 /* mm1= (12*vnb12-6*vnb6)*rinv */
+
+ pfsub mm1, mm3 /* subtract the tabulated coulomb part (fijC*tabscale) */
+
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch the stored dr (dx1...) to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm4, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm4 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 2
+ jl .i3100_finish_inner
+ jmp .i3100_unroll_loop
+.i3100_finish_inner:
+ /* innerk is -2 (no atoms left) or -1 (one atom left) here; bit 0 tells which */
+ and dword ptr [esp + innerk], 1
+ jnz .i3100_single_inner
+ jmp .i3100_updateouterdata
+.i3100_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+
+ movq mm1, mm0
+ pfmul mm1,mm1 /* mm1=invsq */
+ movq mm2, mm1
+ pfmul mm2,mm1
+ pfmul mm2,mm1 /* mm2=rinvsix */
+ movq mm1,mm2
+ pfmul mm1,mm1 /* mm1=rinvtwelve */
+
+ pfmul mm3, [esp + tsc]
+
+ pfmul mm1, [esp + c12]
+
+ pfmul mm2, [esp + c6]
+
+ movq mm4, mm1
+ pfsub mm4, mm2 /* mm4 = vnb12-vnb6 */
+
+ pfmul mm2, [esp + six]
+ pfmul mm1, [esp + twelve]
+
+ pfsub mm1, mm2
+ pfmul mm1, mm0 /* mm1= (12*vnb12-6*vnb6)*rinv */
+
+ pfsub mm1, mm3
+
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm4, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm4 /* store the sum */
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3100_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3100_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3100_outer
+.i3100_end:
+ femms
+ add esp, 148 /* release local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+
+
+.globl inl3110_3dnow
+ .type inl3110_3dnow,@function
+inl3110_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+.equ nsatoms, 84
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ iq, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 40 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 48 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 56 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 64 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 72 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 80 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 88 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 96 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 104 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 112
+.equ innerjjnr0, 116
+.equ innerk0, 120
+.equ innerjjnr, 124
+.equ innerk, 128
+.equ fix, 132
+.equ fiy, 136
+.equ fiz, 140
+.equ dx1, 144
+.equ dy1, 148
+.equ dz1, 152
+.equ dx2, 156
+.equ dy2, 160
+.equ dz2, 164
+.equ nsvdwc, 168
+.equ nscoul, 172
+.equ nsvdw, 176
+.equ solnr, 180
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 184 /* local stack space */
+ femms
+ movq mm0, [mm_six]
+ movq mm1, [mm_twelve]
+ movq [esp + six], mm0
+ movq [esp + twelve], mm1
+ movq mm2, [mm_two]
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm2
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i3110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i3110_mno_vdwc
+ jmp .i3110_testcoul
+.i3110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3110_unroll_vdwc_loop
+ jmp .i3110_finish_vdwc_inner
+.i3110_unroll_vdwc_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ movq mm1, mm0
+ pfmul mm1,mm1 /* mm1=invsq */
+ movq mm2, mm1
+ pfmul mm2,mm1
+ pfmul mm2,mm1 /* mm2=rinvsix */
+ movq mm1,mm2
+ pfmul mm1,mm1 /* mm1=rinvtwelve */
+
+ pfmul mm3, [esp + tsc]
+
+ pfmul mm1, [esp + c12]
+
+ pfmul mm2, [esp + c6]
+
+ movq mm4, mm1
+ pfsub mm4, mm2 /* mm4 = vnb12-vnb6 */
+
+ pfmul mm2, [esp + six]
+ pfmul mm1, [esp + twelve]
+
+ pfsub mm1, mm2
+ pfmul mm1, mm0 /* mm1= (12*vnb12-6*vnb6)*rinv11 */
+
+ pfsub mm1, mm3
+
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm4, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm4 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3110_finish_vdwc_inner
+ jmp .i3110_unroll_vdwc_loop
+.i3110_finish_vdwc_inner:
+ and [esp + innerk], 1
+ jnz .i3110_single_vdwc_inner
+ jmp .i3110_updateouterdata_vdwc
+.i3110_single_vdwc_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ movq mm1, mm0
+ pfmul mm1,mm1 /* mm1=invsq */
+ movq mm2, mm1
+ pfmul mm2,mm1
+ pfmul mm2,mm1 /* mm2=rinvsix */
+ movq mm1,mm2
+ pfmul mm1,mm1 /* mm1=rinvtwelve */
+
+ pfmul mm3, [esp + tsc]
+
+ pfmul mm1, [esp + c12]
+
+ pfmul mm2, [esp + c6]
+
+ movq mm4, mm1
+ pfsub mm4, mm2 /* mm4 = vnb12-vnb6 */
+
+ pfmul mm2, [esp + six]
+ pfmul mm1, [esp + twelve]
+
+ pfsub mm1, mm2
+ pfmul mm1, mm0 /* mm1= (12*vnb12-6*vnb6)*rinv11 */
+
+ pfsub mm1, mm3
+
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm4, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm4 /* store the sum */
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3110_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz .i3110_testcoul
+ jmp .i3110_mno_vdwc
+.i3110_testcoul:
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3110_mno_coul
+ jmp .i3110_testvdw
+.i3110_mno_coul:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3110_unroll_coul_loop
+ jmp .i3110_finish_coul_inner
+.i3110_unroll_coul_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3110_finish_coul_inner
+ jmp .i3110_unroll_coul_loop
+.i3110_finish_coul_inner:
+ and [esp + innerk], 1
+ jnz .i3110_single_coul_inner
+ jmp .i3110_updateouterdata_coul
+.i3110_single_coul_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3110_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul]
+ jz .i3110_testvdw
+ jmp .i3110_mno_coul
+.i3110_testvdw:
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i3110_mno_vdw
+ jmp .i3110_last_mno
+.i3110_mno_vdw:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3110_unroll_vdw_loop
+ jmp .i3110_finish_vdw_inner
+.i3110_unroll_vdw_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrcp mm0, mm4 /* lookup reciprocal seed */
+ pfrcp mm1, mm6
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ /* amd 3dnow N-R iteration to get full precision. */
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0
+ /* mm4 now contains invsq,
+ * do potential and fscal
+ */
+ movq mm0, mm4
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5,mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3110_finish_vdw_inner
+ jmp .i3110_unroll_vdw_loop
+.i3110_finish_vdw_inner:
+ and [esp + innerk], 1
+ jnz .i3110_single_vdw_inner
+ jmp .i3110_updateouterdata_vdw
+.i3110_single_vdw_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm4=rsq */
+
+ pfrcp mm0,mm4
+ pfrcpit1 mm4,mm0
+ pfrcpit2 mm4,mm0 /* mm4=invsq */
+ /* calculate potentials and scalar force */
+ movq mm0, mm4
+
+ pfmul mm4, mm0
+ pfmul mm4, mm0 /* mm4=rinvsix */
+ movq mm5, mm4
+ pfmul mm5, mm5 /* mm5=rinvtwelve */
+
+ pfmul mm5, [esp + c12]
+ pfmul mm4, [esp + c6]
+ movq mm6, mm5 /* mm6 is vnb12-vnb6 */
+ pfsub mm6, mm4
+
+ pfmul mm4, [esp + six]
+
+ pfmul mm5, [esp + twelve]
+ pfsub mm5, mm4
+ pfmul mm0, mm5 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3110_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i3110_last_mno
+ jmp .i3110_mno_vdw
+
+.i3110_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3110_outer
+.i3110_end:
+ femms
+ add esp, 184
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+/*
+ * inl3120_3dnow: 3DNow! nonbonded kernel for a three-atom i molecule (called
+ * O, H1 and H2 in the comments below) interacting with single j atoms.
+ * Coulomb energy and force are interpolated from VFtab (four floats are read
+ * per table point); Lennard-Jones uses c6/c12 from nbfp and is evaluated for
+ * the first i atom (O) only.
+ *
+ * All arguments are read from the stack through ebp (offsets below). The
+ * stack copies of shift, iinr, jindex and gid are advanced in place and nri
+ * is counted down to zero. eax-edi are saved on entry and restored on exit,
+ * and femms is issued at both ends.
+ *
+ * Both loops test their counter after the body, so nri and every neighbour
+ * list length (jindex[n+1]-jindex[n]) must be at least one.
+ */
+.globl inl3120_3dnow
+ .type inl3120_3dnow,@function
+inl3120_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ iqO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ iqH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqO, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ qqH, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ six, 108 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 116 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 124 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 132 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 140 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 148 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 156
+.equ innerk, 160
+.equ fixO, 164
+.equ fiyO, 168
+.equ fizO, 172
+.equ fixH, 176 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 184 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 192 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 200
+.equ dyO, 204
+.equ dzO, 208
+.equ dxH, 212 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 220 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 228 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 236 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 244 /* local stack space */
+ femms
+
+ /* The i charges and ntia below are computed once, from the first entry */
+ /* of iinr[], and reused unchanged for every outer iteration. */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel]
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] */
+ pfmul mm2, mm1
+ movq [esp + iqO], mm2 /* iqO = facel*charge[ii] */
+
+ movd mm2, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] */
+ pfmul mm2, mm1
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iqH], mm2 /* iqH = facel*charge[i.i0+1] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ shl edx, 1
+ mov ecx, edx
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[i.i0] */
+ mov [esp + ntia], ecx
+
+ movq mm3, [mm_two]
+ movq mm4, [mm_six]
+ movq mm5, [mm_twelve]
+ movq mm6, [ebp + tabscale] /* only the low dword is kept by the unpack below */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + two], mm3
+ movq [esp + six], mm4
+ movq [esp + twelve], mm5
+ movq [esp + tsc], mm6
+ /* assume we have at least one i particle - start directly */
+.i3120_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* 8-byte store also covers ixH(low), which is rewritten below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7
+ movd [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3120_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ /* Load the list pointer into ecx so the prefetch below really targets */
+ /* the neighbour list; ecx is reloaded with charge[] right afterwards. */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge]
+ movd mm7, [ecx + eax*4]
+ punpckldq mm7,mm7
+ movq mm6,mm7
+ pfmul mm6, [esp + iqO]
+ pfmul mm7, [esp + iqH] /* mm6=qqO, mm7=qqH */
+ movd [esp + qqO], mm6
+ movq [esp + qqH], mm7
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr] */
+ mov ecx, [ebp + nbfp]
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [ecx + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [ecx + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4 /* mm2-mm4 is jx-jz, each copied to both halves */
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO] /* only the low half (dzO) is meaningful */
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ /* mm2-mm4 are untouched since the expansion above, no need to unpack again */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0, mm1 /* mm0=r */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2 /* four floats per table point */
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqO] /* fijC=qq*FF */
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ movq mm3, mm7
+ pfmul mm3, [esp + tsc]
+
+ /* nontabulated LJ - mm1 is invsqrt. - keep mm1! */
+ movq mm0, mm1
+ pfmul mm0, mm0 /* mm0 is invsq */
+ movq mm2, mm0
+ pfmul mm2, mm0
+ pfmul mm2, mm0 /* mm2 = rinvsix */
+ movq mm4, mm2
+ pfmul mm4, mm4 /* mm4=rinvtwelve */
+
+ pfmul mm4, [esp + c12]
+ pfmul mm2, [esp + c6]
+ movq mm5, mm4
+ pfsub mm5, mm2 /* mm5=vnb12-vnb6 */
+
+ pfmul mm2, [esp + six]
+ pfmul mm4, [esp + twelve]
+ pfsub mm4, mm2
+ pfmul mm4, mm1 /* mm4=(12*vnb12-6*vnb6)*rinv11 */
+
+ pfsubr mm3, mm4 /* mm3 = mm4 - mm3, i.e. LJ term minus tabscale*fijC */
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+ /* now do the two hydrogens. */
+ movq mm0, [esp + tmprsqH] /* mm0=rsqH */
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* table index for the second hydrogen */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i3120_updateouterdata
+ jmp .i3120_inner_loop
+.i3120_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ /* fshift gets the sum of the O, H1 and H2 forces */
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* same for Vnb */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3120_end
+ /* not last, iterate once more! */
+ jmp .i3120_outer
+.i3120_end:
+ femms
+ add esp, 244
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+.globl inl3130_3dnow /* 3DNow! inner loop for pairs of three-atom (O,H,H) molecules: tabulated Coulomb for all nine atom pairs, plus c6/c12 Lennard-Jones for the O-O pair */
+ .type inl3130_3dnow,@function
+inl3130_3dnow: /* all arguments are read from the stack relative to ebp (first one at ebp+8) */
+.equ nri, 8 /* remaining outer iterations; decremented once per i molecule */
+.equ iinr, 12 /* pointer into the i index list; advanced by 4 per outer iteration */
+.equ jindex, 16 /* pointer into jindex[]; entries n and n+1 bound the j list */
+.equ jjnr, 20 /* base of the j index list */
+.equ shift, 24 /* pointer into the shift index list; advanced by 4 per outer iteration */
+.equ shiftvec, 28 /* base of the shift vectors (3 floats per entry) */
+.equ fshift, 32 /* base of the shift force array (3 floats per entry) */
+.equ gid, 36 /* pointer into the group index list; advanced by 4 per outer iteration */
+.equ pos, 40 /* base of the coordinate array (3 floats per atom) */
+.equ faction, 44 /* base of the force array (3 floats per atom) */
+.equ charge, 48 /* base of the charge array */
+.equ facel, 52 /* scalar factor multiplied into the charge products */
+.equ Vc, 56 /* Coulomb energy array, indexed by group index */
+.equ type, 60 /* base of the atom type array */
+.equ ntype, 64 /* number of atom types (used to index nbfp) */
+.equ nbfp, 68 /* base of the c6/c12 parameter table */
+.equ Vnb, 72 /* c6/c12 energy array, indexed by group index */
+.equ tabscale, 76 /* scalar: table points per unit distance */
+.equ VFtab, 80 /* base of the potential/force table (4 floats per point here) */
+ /* stack offsets for local variables (relative to esp after the 232-byte allocation) */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOO, 44 /* 64-bit slot, value in the low half only (stored without spreading) */
+.equ qqOH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqHH, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 68 /* 64-bit slot, value in the low half only (loaded with movd) */
+.equ c12, 76 /* 64-bit slot, value in the low half only (loaded with movd) */
+.equ six, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ twelve, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 108 /* 64-bit slot for the integer table indices produced by pf2iw */
+.equ tsc, 116 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 124 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 132 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 140
+.equ innerk, 144
+.equ fixO, 148
+.equ fiyO, 152
+.equ fizO, 156
+.equ fixH, 160 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 168 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 176 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 184
+.equ dyO, 188
+.equ dzO, 192
+.equ dxH, 200 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 208 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 216 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 224 /* repeated (64bit) to fill 3dnow reg */
+ push ebp /* set up the frame so arguments are at [ebp + 8 ...] */
+ mov ebp,esp
+ push eax /* save all general registers used below; restored in reverse order at .i3130_end */
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 232 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel] /* mm1=facel */
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] (O) */
+ movd mm3, [edx + ebx*4 + 4] /* mm3=charge[i.i0+1] (H) */
+ movq mm4, mm2
+ pfmul mm4, mm1 /* mm4=qO*facel */
+ movq mm6, mm3
+ pfmul mm6, mm1 /* mm6=qH*facel */
+ movq mm5, mm4
+ pfmul mm4, mm2 /* mm4=qqOO*facel */
+ pfmul mm5, mm3 /* mm5=qqOH*facel */
+ pfmul mm6, mm3 /* mm6=qqHH*facel */
+ punpckldq mm5,mm5 /* spread to both halves */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + qqOO], mm4 /* not spread: only the low half is used for the O-O pair */
+ movq [esp + qqOH], mm5
+ movq [esp + qqHH], mm6
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4] /* ecx=type of the first i atom */
+ shl ecx, 1
+ mov edx, ecx /* edx=2*type */
+ imul ecx, [ebp + ntype] /* ecx=2*type*ntype */
+ add edx, ecx /* edx=2*type*(ntype+1): index of the c6/c12 pair for (type,type) */
+ mov eax, [ebp + nbfp]
+ movd mm0, [eax + edx*4] /* c6 */
+ movd mm1, [eax + edx*4 + 4] /* c12 */
+ movq [esp + c6], mm0
+ movq [esp + c12], mm1
+ movq mm2, [mm_two] /* mm_two/mm_six/mm_twelve are defined outside this function - presumably packed 2.0/6.0/12.0, confirm */
+ movq mm3, [mm_six]
+ movq mm4, [mm_twelve]
+ movq [esp + two], mm2
+ movq [esp + six], mm3
+ movq [esp + twelve], mm4
+ movd mm5, [ebp + tabscale]
+ punpckldq mm5,mm5 /* spread tabscale to both halves */
+ movq [esp + tsc], mm5
+.i3130_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step (explicit 32-bit operand size) */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer (explicit 32-bit operand size) */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* 8-byte store also touches the low half of ixH, which is rewritten below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot, vnbtot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7 /* clears fixO and fiyO */
+ movq [esp + fizO], mm7 /* clears fizO (and the low half of fixH, cleared again next) */
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4 /* advance pointer (explicit 32-bit operand size) */
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3130_inner_loop:
+ /* one j molecule per iteration; this first part handles the j oxygen against iO, iH1 and iH2. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer (explicit 32-bit operand size) */
+
+ lea eax, [eax + eax*2] /* eax=3*jnr */
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm0
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt (OO) */
+ pfmul mm0, mm1 /* mm0=r = rsq*invsqrt (OO) */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2 /* 4 floats per table point */
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ movq mm3, mm7
+ pfmul mm3, [esp + tsc] /* mm3=fijC*tabscale */
+
+ movq mm5, mm1
+ pfmul mm5,mm5 /* mm5=rinvsq */
+ movq mm4, mm5
+ pfmul mm4,mm5
+ pfmul mm4,mm5
+ movq mm5, mm4
+ pfmul mm5,mm5 /* mm4=rinvsix, mm5=rinvtwelve */
+
+ pfmul mm4, [esp + c6]
+ pfmul mm5, [esp + c12]
+ movq mm6,mm5
+ pfsub mm6,mm4 /* mm6=c12*rinvtwelve - c6*rinvsix */
+
+ pfmul mm4, [esp + six]
+ pfmul mm5, [esp + twelve]
+ pfsub mm5,mm4
+ pfmul mm5, mm1
+ pfsubr mm3, mm5 /* mm3 = mm5 - mm3: c6/c12 part minus the table Coulomb part */
+
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* update vnbtot */
+ pfadd mm6, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm6 /* store the sum */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+ /* time for hydrogens! */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* table index for the second hydrogen */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* interactions with j H1 (second atom of the j molecule, at byte offset 12) */
+
+ movq mm0, [esi + eax*4 + 12]
+ movd mm1, [esi + eax*4 + 20]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO (i oxygen to this j atom) */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1 /* mm0=r (rsq*invsqrt) */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, force is moved to mm3 */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3, mm3
+ pfsub mm3, mm7 /* mm3=-fijC */
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* table index for the second hydrogen */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force (j H1 slot of faction) */
+ movq mm2, [edi + eax*4 + 12]
+ movd mm3, [edi + eax*4 + 20]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 12], mm2
+ movd [edi + eax*4 + 20], mm3
+
+ /* interactions with j H2 (third atom of the j molecule, at byte offset 24) */
+ movq mm0, [esi + eax*4 + 24]
+ movd mm1, [esi + eax*4 + 32]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO (i oxygen to this j atom) */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1 /* mm0=r (rsq*invsqrt) */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3,mm3
+ pfsub mm3, mm7 /* mm3=-fijC */
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4] /* table index for the second hydrogen */
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force (j H2 slot of faction) */
+ movq mm2, [edi + eax*4 + 24]
+ movd mm3, [edi + eax*4 + 32]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 24], mm2
+ movd [edi + eax*4 + 32], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk] /* one j molecule consumed */
+ jz .i3130_updateouterdata
+ jmp .i3130_inner_loop
+.i3130_updateouterdata:
+ mov ecx, [esp + ii3] /* ecx=3*ii, index of the i oxygen in faction */
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3 /* swap halves so fzH2 ends up in the low part */
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO] /* sum of the O, H1 and H2 forces goes into fshift */
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7 /* only the low half (z) is stored */
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer (explicit 32-bit operand size) */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3130_end
+ /* not last, iterate once more! */
+ jmp .i3130_outer
+.i3130_end: /* common exit: undo the prologue in reverse order */
+ femms
+ add esp, 232 /* release the local stack space allocated in the prologue */
+ pop edi /* restore the registers saved on entry */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl3300_3dnow
+ .type inl3300_3dnow,@function
+inl3300_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ix, 8
+.equ iy, 12
+.equ iz, 16
+.equ iq, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 84
+.equ innerjjnr, 88
+.equ innerk, 92
+.equ fix, 96
+.equ fiy, 100
+.equ fiz, 104
+.equ dx1, 108
+.equ dy1, 112
+.equ dz1, 116
+.equ dx2, 120
+.equ dy2, 124
+.equ dz2, 128
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 132 /* local stack space */
+ femms
+ /* move data to local stack */
+ movq mm0, [mm_two]
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i3300_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel]
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm0, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm3, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx
+ pfadd mm1, mm3
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear total potential and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3300_unroll_loop
+ jmp .i3300_finish_inner
+.i3300_unroll_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4 + 32]
+ punpckldq mm5, [edx + ecx*4 + 36]
+ punpckldq mm6, [edx + ecx*4 + 40]
+ punpckldq mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3300_finish_inner
+ jmp .i3300_unroll_loop
+.i3300_finish_inner:
+ /* Reached when the paired loop has driven innerk negative. Bit 0 of innerk */
+ /* tells whether one unpaired j particle is still left to process. */
+ /* The operand size is spelled out: a memory/immediate pair does not imply */
+ /* one, and the file already writes "dword ptr" for inc/dec on stack slots. */
+ and dword ptr [esp + innerk], 1
+ jnz .i3300_single_inner
+ jmp .i3300_updateouterdata
+.i3300_single_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ /* Only the low 32-bit half of the MMX registers carries the pair result in this path. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr, index of the j particle's x coordinate */
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0 /* dr = ir - jr (x and y) */
+ pfsubr mm5, mm1 /* dr = ir - jr (z) */
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* low half of mm4 = rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n1: twelve floats per table point */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3300_updateouterdata:
+ /* Flush the per-i-particle accumulators: add fix/fiz to the i particle's */
+ /* force and to the shift force, add vctot/vnbtot to the group energies, */
+ /* then either start the next outer iteration or leave. */
+ /* NOTE(review): edi is expected to still hold the faction base set up */
+ /* earlier in the function (outside this section) - confirm. */
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ /* explicit size: the pointer argument is 32 bits wide and a memory/immediate */
+ /* pair does not tell the assembler which width to encode */
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+
+ /* finish if last */
+ mov ecx, [ebp + nri] /* nri is used as a countdown of remaining i particles */
+ dec ecx
+ jecxz .i3300_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3300_outer
+.i3300_end:
+ /* Epilogue: leave MMX/3DNow! state, drop the locals and restore registers. */
+ femms /* fast exit from MMX state so x87 code can run afterwards */
+ /* NOTE(review): 132 must equal the amount subtracted from esp in this */
+ /* function's prologue, which is outside the visible section - confirm. */
+ add esp, 132
+ /* pops mirror the prologue pushes in reverse order */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+.globl inl3310_3dnow
+ .type inl3310_3dnow,@function
+inl3310_3dnow:
+ /* 3DNow! inner loop with tabulated coulomb and tabulated dispersion/repulsion. */
+ /* Each i entry is a group of atoms split by nsatoms into three runs: atoms */
+ /* with both interactions, coulomb-only atoms and vdw-only atoms. */
+ /* Argument offsets relative to ebp (after "push ebp; mov ebp,esp" the first */
+ /* stack argument sits at ebp+8). */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+.equ nsatoms, 84
+ /* stack offsets for local variables */
+ /* x/y pairs (shX/shY, ix/iy, fix/fiy, dx/dy) are accessed with one 64-bit movq, */
+ /* which is why the *Y names are never referenced directly. */
+.equ is3, 0
+.equ ii3, 4
+.equ shX, 8
+.equ shY, 12
+.equ shZ, 16
+.equ ix, 20
+.equ iy, 24
+.equ iz, 28
+.equ iq, 32 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 40 /* 64bit: one partial sum per half of a 3dnow reg */
+.equ vnbtot, 48 /* 64bit: one partial sum per half of a 3dnow reg */
+.equ c6, 56 /* 64bit: c6 of 1st / 2nd j particle */
+.equ c12, 64 /* 64bit: c12 of 1st / 2nd j particle */
+.equ two, 72 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 80 /* 64bit: table index of 1st / 2nd j particle */
+.equ tsc, 88 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 96
+.equ innerjjnr0, 100
+.equ innerk0, 104
+.equ innerjjnr, 108
+.equ innerk, 112
+.equ fix, 116
+.equ fiy, 120
+.equ fiz, 124
+.equ dx1, 128
+.equ dy1, 132
+.equ dz1, 136
+.equ dx2, 140
+.equ dy2, 144
+.equ dz2, 148
+.equ nsvdwc, 152
+.equ nscoul, 156
+.equ nsvdw, 160
+.equ solnr, 164
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 168 /* local stack space */
+ femms
+ movq mm0, [mm_two] /* mm_two is defined elsewhere in the file; presumably 2.0 in both halves */
+ movd mm3, [ebp + tabscale]
+ movq [esp + two], mm0
+ punpckldq mm3,mm3 /* copy tabscale into both halves */
+ movq [esp + tsc], mm3
+ /* assume we have at least one i particle - start directly */
+.i3310_outer:
+ /* Per-i-entry setup: fetch the shift vector, the first atom index, the */
+ /* three atom counts from nsatoms and the j-list bounds, then dispatch to */
+ /* the first atom class. The argument slots on the stack are used as running */
+ /* pointers and advanced in place; those read-modify-write instructions carry */
+ /* an explicit "dword ptr" because a memory/immediate pair does not fix the */
+ /* operand size. */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm0, [eax + ebx*4] /* move shX/shY to mm0 and shZ to mm1 */
+ movd mm1, [eax + ebx*4 + 8]
+ movq [esp + shX], mm0
+ movd [esp + shZ], mm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ /* nsatoms holds three ints per i entry: [0], [1], [2] */
+ mov eax, [ebp + nsatoms]
+ add dword ptr [ebp + nsatoms], 12 /* advance to the next triple */
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax /* ecx = [0] - [2] */
+ sub eax, edx /* eax = [2] - [1] */
+
+ mov [esp + nsvdwc], edx /* atoms with both vdw and coulomb */
+ mov [esp + nscoul], eax /* coulomb-only atoms */
+ mov [esp + nsvdw], ecx /* vdw-only atoms */
+
+ /* clear potential */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ mov [esp + solnr], ebx /* running atom index, starts at ii */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+
+ /* skip the vdw+coulomb atom loop when there are no such atoms */
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i3310_mno_vdwc
+ jmp .i3310_testcoul
+.i3310_mno_vdwc:
+ /* Set up one atom that has both coulomb and vdw interactions: charge, */
+ /* type offset, shifted coordinates, cleared force accumulator and a rewound */
+ /* j list. */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* only the low half matters; the high half is overwritten below */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ /* every atom of the entry walks the same j list from its start */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3310_unroll_vdwc_loop /* at least two j particles: use the paired loop */
+ jmp .i3310_finish_vdwc_inner
+.i3310_unroll_vdwc_loop:
+ /* paired innerloop starts here */
+ /* Two j particles per pass: the first in the low half of each MMX register, */
+ /* the second in the high half. Coulomb, dispersion and repulsion are all */
+ /* read from VFtab, twelve floats per table point. */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ /* explicit operand size: a memory/immediate pair does not imply one */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6,mm5
+ punpckldq mm5,mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6,mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4 /* both table indices go to memory for scalar addressing */
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n1 */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4 + 32]
+ punpckldq mm5, [edx + ecx*4 + 36]
+ punpckldq mm6, [edx + ecx*4 + 40]
+ punpckldq mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch the stored dr values that are reloaded below */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 2 /* explicit size, see the pointer advance above */
+ jl .i3310_finish_vdwc_inner
+ jmp .i3310_unroll_vdwc_loop
+.i3310_finish_vdwc_inner:
+ /* innerk is negative here: -2 when the j list is exhausted, -1 when one */
+ /* unpaired j particle remains, so bit 0 selects the single-particle path. */
+ /* The operand size is explicit because a memory/immediate pair does not */
+ /* imply one. */
+ and dword ptr [esp + innerk], 1
+ jnz .i3310_single_vdwc_inner
+ jmp .i3310_updateouterdata_vdwc
+.i3310_single_vdwc_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ /* Only the low 32-bit half of the MMX registers carries the pair result in this path. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr, index of the j particle's x coordinate */
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0 /* dr = ir - jr (x and y) */
+ pfsubr mm5, mm1 /* dr = ir - jr (z) */
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* low half of mm4 = rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n1: twelve floats per table point */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3310_updateouterdata_vdwc:
+ /* Add the force accumulated for this atom (fix/fiz) to faction[ii3] and to */
+ /* fshift[is3], then continue with the next vdw+coulomb atom if any remain. */
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz] /* only the low half (z) is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc] /* nsvdwc counts the atoms of this class still to do */
+ jz .i3310_testcoul
+ jmp .i3310_mno_vdwc
+.i3310_testcoul:
+ /* Enter the coulomb-only atom loop only when nscoul is nonzero; otherwise */
+ /* go on to the vdw-only test. */
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3310_mno_coul
+ jmp .i3310_testvdw
+.i3310_mno_coul:
+ /* Set up one coulomb-only atom: charge, shifted coordinates, cleared force */
+ /* accumulator and a rewound j list. No type/ntia lookup is done here. */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+ mov edx, [ebp + charge]
+ movd mm2, [edx + ebx*4] /* mm2=charge[ii] */
+ pfmul mm2, [ebp + facel] /* only the low half matters; the high half is overwritten below */
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iq], mm2 /* iq =facel*charge[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ /* every atom of the entry walks the same j list from its start */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3310_unroll_coul_loop /* at least two j particles: use the paired loop */
+ jmp .i3310_finish_coul_inner
+.i3310_unroll_coul_loop:
+ /* paired innerloop starts here */
+ /* Coulomb-only variant: two j particles per pass (low/high register halves), */
+ /* using only the first four floats of each twelve-float table point. */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ /* explicit operand size: a memory/immediate pair does not imply one */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + charge] /* base of charge[] */
+ movq mm5, [esp + iq]
+ movd mm3, [ecx + eax*4] /* charge[jnr1] */
+ punpckldq mm3, [ecx + ebx*4] /* move charge 2 to high part of mm3 */
+ pfmul mm3,mm5 /* mm3 now has qq for both particles */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6,mm0 /* dr = ir - jr */
+ pfsubr mm7,mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6,mm6 /* square dx,dy,dz */
+ pfmul mm7,mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0,mm1
+ punpckldq mm4,mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2,mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movq [esp + n1], mm4 /* both table indices go to memory for scalar addressing */
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n1 */
+ /* coulomb table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ /* tabscale is folded into -fijC here; the other loops multiply it into */
+ /* mm0 (invsqrt) first. The product is the same apart from rounding order. */
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch the stored dr values that are reloaded below */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 2 /* explicit size, see the pointer advance above */
+ jl .i3310_finish_coul_inner
+ jmp .i3310_unroll_coul_loop
+.i3310_finish_coul_inner:
+ /* innerk is negative here: -2 when the j list is exhausted, -1 when one */
+ /* unpaired j particle remains, so bit 0 selects the single-particle path. */
+ /* The operand size is explicit because a memory/immediate pair does not */
+ /* imply one. */
+ and dword ptr [esp + innerk], 1
+ jnz .i3310_single_coul_inner
+ jmp .i3310_updateouterdata_coul
+.i3310_single_coul_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ /* Coulomb only; just the low 32-bit half of the MMX registers carries the pair result. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov ecx, [ebp + charge]
+ movd mm5, [esp + iq]
+ movd mm3, [ecx + eax*4]
+ pfmul mm3, mm5 /* mm3=qq */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr, index of the j particle's x coordinate */
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0 /* dr = ir - jr (x and y) */
+ pfsubr mm5, mm1 /* dr = ir - jr (z) */
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* low half of mm4 = rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm0=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n1: twelve floats per table point */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, mm3 /* vcoul=qq*VV */
+ pfmul mm3, mm7 /* fijC=FF*qq */
+
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ pfadd mm5, [esp + vctot] /* add the earlier value */
+ movq [esp + vctot], mm5 /* store the sum */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3310_updateouterdata_coul:
+ /* Add the force accumulated for this atom (fix/fiz) to faction[ii3] and to */
+ /* fshift[is3], then continue with the next coulomb-only atom if any remain. */
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz] /* only the low half (z) is stored below */
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul] /* nscoul counts the atoms of this class still to do */
+ jz .i3310_testvdw
+ jmp .i3310_mno_coul
+.i3310_testvdw:
+ /* Enter the vdw-only atom loop only when nsvdw is nonzero; otherwise jump */
+ /* straight to .i3310_last_mno. */
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i3310_mno_vdw
+ jmp .i3310_last_mno
+.i3310_mno_vdw:
+ /* Set up one vdw-only atom: type offset, shifted coordinates, cleared force */
+ /* accumulator and a rewound j list. No charge is loaded here. */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movq mm0, [eax + ebx*4]
+ movd mm1, [eax + ebx*4 + 8]
+ pfadd mm0, [esp + shX]
+ pfadd mm1, [esp + shZ]
+ movq [esp + ix], mm0
+ movd [esp + iz], mm1
+
+ /* clear forces */
+ pxor mm7,mm7
+ movq [esp + fix], mm7
+ movd [esp + fiz], mm7
+
+ /* every atom of the entry walks the same j list from its start */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 2
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3310_unroll_vdw_loop /* at least two j particles: use the paired loop */
+ jmp .i3310_finish_vdw_inner
+.i3310_unroll_vdw_loop:
+ /* paired innerloop starts here */
+ mov ecx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax/ebx=jnr */
+ add [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+ prefetch [ecx + 16] /* prefetch data - trial and error says 16 is best */
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ mov ecx, [ecx + ebx*4] /* type [jnr2] */
+
+ mov esi, [ebp + nbfp] /* base of nbfp */
+ shl edx, 1
+ shl ecx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ add ecx, [esp + ntia]
+
+ movq mm5, [esi + edx*4] /* mm5 = 1st c6 / c12 */
+ movq mm7, [esi + ecx*4] /* mm7 = 2nd c6 / c12 */
+ movq mm6, mm5
+ punpckldq mm5, mm7 /* mm5 = 1st c6 / 2nd c6 */
+ punpckhdq mm6, mm7 /* mm6 = 1st c12 / 2nd c12 */
+ movq [esp + c6], mm5
+ movq [esp + c12], mm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mov esi, [ebp + pos]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4] /* fetch first j coordinates */
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4,mm0 /* dr = ir - jr */
+ pfsubr mm5,mm1
+ movq [esp + dx1], mm4 /* store dr */
+ movd [esp + dz1], mm5
+ pfmul mm4,mm4 /* square dx,dy,dz */
+ pfmul mm5,mm5
+ pfacc mm4, mm5 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm4, mm5 /* first rsq in lower mm4 */
+
+ movq mm6, [esi + ebx*4] /* fetch second j coordinates */
+ movd mm7, [esi + ebx*4 + 8]
+
+ pfsubr mm6, mm0 /* dr = ir - jr */
+ pfsubr mm7, mm1
+ movq [esp + dx2], mm6 /* store dr */
+ movd [esp + dz2], mm7
+ pfmul mm6, mm6 /* square dx,dy,dz */
+ pfmul mm7, mm7
+ pfacc mm6, mm7 /* accumulate to get dx*dx+dy*dy+dz*dz */
+ pfacc mm6, mm7 /* second rsq in lower mm6 */
+
+ pfrsqrt mm0, mm4 /* lookup inverse square root seed */
+ pfrsqrt mm1, mm6
+
+
+ punpckldq mm0, mm1
+ punpckldq mm4, mm6 /* now 4 has rsq and 0 the seed for both pairs. */
+ movq mm2, mm0 /* amd 3dnow N-R iteration to get full precision. */
+ pfmul mm0, mm0
+ pfrsqit1 mm0, mm4
+ pfrcpit2 mm0, mm2
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+ /* do potential and fscal */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4, mm1
+ movq [esp + n1], mm4
+ pi2fd mm4, mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 is n0 */
+
+ movq mm2, mm1
+ pfmul mm2, mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ /* dispersion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4 + 16]
+ punpckldq mm5, [edx + ecx*4 + 20]
+ punpckldq mm6, [edx + ecx*4 + 24]
+ punpckldq mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm1, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ prefetchw [esp + dx1] /* prefetch i forces to cache */
+
+ /* spread fscalar to both positions */
+ movq mm1,mm0
+ punpckldq mm0,mm0
+ punpckhdq mm1,mm1
+
+ /* calc vector force */
+ prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+ movq mm2, [esp + dx1] /* fetch dr */
+ movd mm3, [esp + dz1]
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+ pfmul mm2, mm0 /* mult by fs */
+ pfmul mm3, mm0
+
+ movq mm4, [esp + dx2] /* fetch dr */
+ movd mm5, [esp + dz2]
+ pfmul mm4, mm1 /* mult by fs */
+ pfmul mm5, mm1
+ /* update i forces */
+
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+
+ pfadd mm0, mm4
+ pfadd mm1, mm5
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j forces */
+
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax*4 + 8]
+ movq mm6, [edi + ebx*4]
+ movd mm7, [edi + ebx*4 + 8]
+
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ pfsub mm6, mm4
+ pfsub mm7, mm5
+
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ movq [edi + ebx*4], mm6
+ movd [edi + ebx*4 + 8], mm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 2
+ jl .i3310_finish_vdw_inner
+ jmp .i3310_unroll_vdw_loop
+.i3310_finish_vdw_inner:
+ and [esp + innerk], 1
+ jnz .i3310_single_vdw_inner
+ jmp .i3310_updateouterdata_vdw
+.i3310_single_vdw_inner:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+
+ mov esi, [ebp + nbfp]
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr1] */
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [esi + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [esi + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esp + ix]
+ movd mm1, [esp + iz]
+ movq mm4, [esi + eax*4]
+ movd mm5, [esi + eax*4 + 8]
+ pfsubr mm4, mm0
+ pfsubr mm5, mm1
+ movq [esp + dx1], mm4
+ pfmul mm4,mm4
+ movd [esp + dz1], mm5
+ pfmul mm5,mm5
+ pfacc mm4, mm5
+ pfacc mm4, mm5 /* mm0=rsq */
+
+ pfrsqrt mm0,mm4
+ movq mm2,mm0
+ pfmul mm0,mm0
+ pfrsqit1 mm0,mm4
+ pfrcpit2 mm0,mm2 /* mm1=invsqrt */
+ pfmul mm4, mm0
+ movq mm1, mm4
+ /* mm0 is invsqrt, and mm1 r. */
+
+ /* calculate potentials and scalar force */
+ pfmul mm1, [esp + tsc] /* mm1=rt */
+ pf2iw mm4,mm1
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm1, mm4 /* now mm1 is eps and mm4 n0. */
+
+ movq mm2,mm1
+ pfmul mm2,mm2 /* mm1 is eps, mm2 is eps2 */
+
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* dispersion table
+ * load all the table values we need
+ */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ movq mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table
+ * load all the table values we need
+ */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+
+ pfmul mm6, mm1 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm1 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of mm3 */
+ pxor mm1,mm1
+ pfsub mm1, mm3
+ pfmul mm0, [esp + tsc]
+ pfmul mm0, mm1 /* mm0 is total fscal now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* spread fscalar to both positions */
+ punpckldq mm0,mm0
+ /* calc vectorial force */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm2, [esp + dx1]
+ movd mm3, [esp + dz1]
+
+ pfmul mm2, mm0
+ pfmul mm3, mm0
+
+ /* update i particle force */
+ movq mm0, [esp + fix]
+ movd mm1, [esp + fiz]
+ pfadd mm0, mm2
+ pfadd mm1, mm3
+ movq [esp + fix], mm0
+ movd [esp + fiz], mm1
+ /* update j particle force */
+ movq mm0, [edi + eax*4]
+ movd mm1, [edi + eax *4+ 8]
+ pfsub mm0, mm2
+ pfsub mm1, mm3
+ movq [edi + eax*4], mm0
+ movd [edi + eax*4 +8], mm1
+ /* done! */
+.i3310_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment i force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fix]
+ pfadd mm7, [esp + fiz]
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i3310_last_mno
+ jmp .i3310_mno_vdw
+
+.i3310_last_mno:
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3310_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3310_outer
+.i3310_end:
+ femms
+ add esp, 168
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+/*
+ * inl3320_3dnow
+ *
+ * 3DNow! inner loop: for every i entry (an oxygen followed by two hydrogens,
+ * stored as three consecutive atoms in pos[]/faction[]) loop over its list of
+ * single j atoms and evaluate
+ *   - oxygen-j:   table Coulomb + table dispersion + table repulsion
+ *   - hydrogen-j: table Coulomb only (both hydrogens done in parallel,
+ *                 H1 in the low half and H2 in the high half of each mm reg)
+ * All three tables are interleaved in VFtab: 12 floats per table point
+ * (4 Coulomb, 4 dispersion, 4 repulsion values).
+ *
+ * Arguments are read from the stack relative to ebp (offsets defined by the
+ * .equ block below).  Forces are accumulated into faction[] and fshift[],
+ * energies into Vc[gid] and Vnb[gid].  The pointer arguments iinr, shift,
+ * jindex and gid as well as the counter nri are advanced/decremented in
+ * place in the argument area.
+ *
+ * The code assumes nri >= 1 and at least one j atom per i entry (both loops
+ * are bottom-tested).  All general purpose registers that are used are
+ * saved and restored; MMX state is cleared with femms on entry and exit.
+ */
+.globl inl3320_3dnow
+ .type inl3320_3dnow,@function
+inl3320_3dnow:
+ /* stack offsets (relative to ebp) of the arguments */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ iqO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ iqH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqO, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ qqH, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 108 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 116 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 124 /* repeated (64bit) to fill 3dnow reg */
+.equ ntia, 132 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 140
+.equ innerk, 144
+.equ fixO, 148
+.equ fiyO, 152
+.equ fizO, 156
+.equ fixH, 160 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 168 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 176 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 184
+.equ dyO, 188
+.equ dzO, 192
+.equ dxH, 196 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 204 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 212 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 220 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 228 /* local stack space */
+ femms
+
+ /* one-time setup: charges and type of the first i entry are used for all */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel]
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] */
+ pfmul mm2, mm1
+ movq [esp + iqO], mm2 /* iqO = facel*charge[ii] (high half is zero) */
+
+ movd mm2, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] */
+ pfmul mm2, mm1
+ punpckldq mm2,mm2 /* spread to both halves */
+ movq [esp + iqH], mm2 /* iqH = facel*charge[i.i0+1] */
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[i.i0] */
+ mov [esp + ntia], ecx
+
+ movq mm3, [mm_two]
+ movd mm4, [ebp + tabscale] /* tabscale is a single 32-bit float argument */
+ punpckldq mm4,mm4 /* spread to both halves */
+ movq [esp + two], mm3
+ movq [esp + tsc], mm4
+ /* assume we have at least one i particle - start directly */
+.i3320_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6 /* high half spills into ixH, which is stored below */
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7
+ movd [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3320_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov ecx, [esp + innerjjnr] /* ecx = pointer to jjnr[k] */
+ mov eax, [ecx] /* eax=jnr offset */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer */
+ prefetch [ecx + 16] /* prefetch upcoming jjnr data - ecx must hold the jjnr pointer here */
+
+ mov ecx, [ebp + charge]
+ movd mm7, [ecx + eax*4]
+ punpckldq mm7,mm7
+ movq mm6,mm7
+ pfmul mm6, [esp + iqO]
+ pfmul mm7, [esp + iqH] /* mm6=qqO, mm7=qqH */
+ movq [esp + qqO], mm6 /* store all 64 bits: qqO is read as a 64-bit operand below */
+ movq [esp + qqH], mm7
+
+ mov ecx, [ebp + type]
+ mov edx, [ecx + eax*4] /* type [jnr] */
+ mov ecx, [ebp + nbfp]
+ shl edx, 1
+ add edx, [esp + ntia] /* tja = ntia + 2*type */
+ movd mm5, [ecx + edx*4] /* mm5 = 1st c6 */
+ movq [esp + c6], mm5
+ movd mm5, [ecx + edx*4 + 4] /* mm5 = 1st c12 */
+ movq [esp + c12], mm5
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO (low half; the high half is not used) */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3 /* saved until the oxygen part is finished */
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0, mm1 /* mm0=r */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2 /* ecx = 12*n0: 12 floats per table point */
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqO] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ movq mm3, mm7
+
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of fscal and multiply with rinv */
+ pxor mm0,mm0
+ pfsubr mm3, mm0 /* mm3 = 0 - mm3 */
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+ /* now do the two hydrogens. */
+ movq mm0, [esp + tmprsqH] /* mm0=rsqH */
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4 /* table indices for H1 (low) and H2 (high) */
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ prefetchw [edi + eax*4] /* prefetch faction to cache */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i3320_updateouterdata
+ jmp .i3320_inner_loop
+.i3320_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ /* unpack the H1/H2 force halves into per-atom x/y pairs and z scalars */
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ /* fshift gets the sum of the O, H1 and H2 forces */
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* same for Vnb */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnb[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3320_end
+ /* not last, iterate once more! */
+ jmp .i3320_outer
+.i3320_end:
+ femms
+ add esp, 228 /* release local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3330_3dnow
+ .type inl3330_3dnow,@function
+inl3330_3dnow:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+.equ is3, 0
+.equ ii3, 4
+.equ ixO, 8
+.equ iyO, 12
+.equ izO, 16
+.equ ixH, 20 /* repeated (64bit) to fill 3dnow reg */
+.equ iyH, 28 /* repeated (64bit) to fill 3dnow reg */
+.equ izH, 36 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOO, 44 /* repeated (64bit) to fill 3dnow reg */
+.equ qqOH, 52 /* repeated (64bit) to fill 3dnow reg */
+.equ qqHH, 60 /* repeated (64bit) to fill 3dnow reg */
+.equ c6, 68 /* repeated (64bit) to fill 3dnow reg */
+.equ c12, 76 /* repeated (64bit) to fill 3dnow reg */
+.equ two, 84 /* repeated (64bit) to fill 3dnow reg */
+.equ n1, 92 /* repeated (64bit) to fill 3dnow reg */
+.equ tsc, 100 /* repeated (64bit) to fill 3dnow reg */
+.equ vctot, 108 /* repeated (64bit) to fill 3dnow reg */
+.equ vnbtot, 116 /* repeated (64bit) to fill 3dnow reg */
+.equ innerjjnr, 124
+.equ innerk, 128
+.equ fixO, 132
+.equ fiyO, 136
+.equ fizO, 140
+.equ fixH, 144 /* repeated (64bit) to fill 3dnow reg */
+.equ fiyH, 152 /* repeated (64bit) to fill 3dnow reg */
+.equ fizH, 160 /* repeated (64bit) to fill 3dnow reg */
+.equ dxO, 168
+.equ dyO, 172
+.equ dzO, 176
+.equ dxH, 180 /* repeated (64bit) to fill 3dnow reg */
+.equ dyH, 188 /* repeated (64bit) to fill 3dnow reg */
+.equ dzH, 196 /* repeated (64bit) to fill 3dnow reg */
+.equ tmprsqH, 204 /* repeated (64bit) to fill 3dnow reg */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 212 /* local stack space */
+ femms
+ /* assume we have at least one i particle - start directly */
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx=ii */
+
+ mov edx, [ebp + charge]
+ movd mm1, [ebp + facel] /* mm1=facel */
+ movd mm2, [edx + ebx*4] /* mm2=charge[i.i0] (O) */
+ movd mm3, [edx + ebx*4 + 4] /* mm2=charge[i.i0+1] (H) */
+ movq mm4, mm2
+ pfmul mm4, mm1
+ movq mm6, mm3
+ pfmul mm6, mm1
+ movq mm5, mm4
+ pfmul mm4, mm2 /* mm4=qqOO*facel */
+ pfmul mm5, mm3 /* mm5=qqOH*facel */
+ pfmul mm6, mm3 /* mm6=qqHH*facel */
+ punpckldq mm5,mm5 /* spread to both halves */
+ punpckldq mm6,mm6 /* spread to both halves */
+ movq [esp + qqOO], mm4
+ movq [esp + qqOH], mm5
+ movq [esp + qqHH], mm6
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ mov edx, ecx
+ imul ecx, [ebp + ntype]
+ add edx, ecx
+ mov eax, [ebp + nbfp]
+ movd mm0, [eax + edx*4]
+ movd mm1, [eax + edx*4 + 4]
+ movq [esp + c6], mm0
+ movq [esp + c12], mm1
+ movq mm2, [mm_two]
+ movq [esp + two], mm2
+ movd mm3, [ebp + tabscale]
+ punpckldq mm3,mm3
+ movq [esp + tsc], mm3
+.i3330_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movq mm5, [eax + ebx*4] /* move shX/shY to mm5 and shZ to mm6. */
+ movd mm6, [eax + ebx*4 + 8]
+ movq mm0, mm5
+ movq mm1, mm5
+ movq mm2, mm6
+ punpckldq mm0,mm0 /* also expand shX,Y,Z in mm0--mm2. */
+ punpckhdq mm1,mm1
+ punpckldq mm2,mm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx=ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ pfadd mm5, [eax + ebx*4] /* ix = shX + posX (and iy too) */
+ movd mm7, [eax + ebx*4 + 8] /* cant use direct memory add for 4 bytes (iz) */
+ mov [esp + ii3], ebx /* (use mm7 as temp. storage for iz.) */
+ pfadd mm6, mm7
+ movq [esp + ixO], mm5
+ movq [esp + izO], mm6
+
+ movd mm3, [eax + ebx*4 + 12]
+ movd mm4, [eax + ebx*4 + 16]
+ movd mm5, [eax + ebx*4 + 20]
+ punpckldq mm3, [eax + ebx*4 + 24]
+ punpckldq mm4, [eax + ebx*4 + 28]
+ punpckldq mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+ pfadd mm0, mm3
+ pfadd mm1, mm4
+ pfadd mm2, mm5
+ movq [esp + ixH], mm0
+ movq [esp + iyH], mm1
+ movq [esp + izH], mm2
+
+ /* clear vctot and i forces */
+ pxor mm7,mm7
+ movq [esp + vctot], mm7
+ movq [esp + vnbtot], mm7
+ movq [esp + fixO], mm7
+ movq [esp + fizO], mm7
+ movq [esp + fixH], mm7
+ movq [esp + fiyH], mm7
+ movq [esp + fizH], mm7
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov [esp + innerk], edx
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+.i3330_inner_loop:
+ /* a single j particle iteration here - compare with the unrolled code for comments. */
+ mov eax, [esp + innerjjnr]
+ mov eax, [eax] /* eax=jnr offset */
+ add [esp + innerjjnr], 4 /* advance pointer */
+
+ lea eax, [eax + eax*2]
+
+ movq mm0, [esi + eax*4]
+ movd mm1, [esi + eax*4 + 8]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm0
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt (OO) */
+ pfmul mm0, mm1 /* mm0=r (OO) */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ movq mm3, mm7
+
+ /* dispersion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 16]
+ movd mm5, [edx + ecx*4 + 20]
+ movd mm6, [edx + ecx*4 + 24]
+ movd mm7, [edx + ecx*4 + 28]
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm4, [esp + c6]
+ pfmul mm7, mm4 /* fijD */
+ pfmul mm5, mm4 /* vnb6 */
+ pfadd mm3, mm7 /* add to fscal */
+
+ /* update vnbtot to release mm5! */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* repulsion table */
+ /* load all the table values we need */
+ movd mm4, [edx + ecx*4 + 32]
+ movd mm5, [edx + ecx*4 + 36]
+ movd mm6, [edx + ecx*4 + 40]
+ movd mm7, [edx + ecx*4 + 44]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ movq mm6, [esp + c12]
+ pfmul mm7, mm6 /* fijR */
+ pfmul mm5, mm6 /* vnb12 */
+ pfadd mm3, mm7 /* total fscal fijC+fijD+fijR */
+
+ /* change sign of fscal and multiply with rinv */
+ pxor mm0,mm0
+ pfsubr mm3, mm0
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ /* update vnbtot */
+ pfadd mm5, [esp + vnbtot] /* add the earlier value */
+ movq [esp + vnbtot], mm5 /* store the sum */
+
+ /* Ready with the oxygen - potential is updated, fscal is in mm3.
+ * time for hydrogens!
+ */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4]
+ movd mm3, [edi + eax*4 + 8]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4], mm2
+ movd [edi + eax*4 +8], mm3
+
+ /* interactions with j H1 */
+
+ movq mm0, [esi + eax*4 + 12]
+ movd mm1, [esi + eax*4 + 20]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1 /* mm0=rsq */
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, force is moved to mm3. */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3, mm3
+ pfsub mm3, mm7
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 12]
+ movd mm3, [edi + eax*4 + 20]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 12], mm2
+ movd [edi + eax*4 + 20], mm3
+
+ /* interactions with j H2 */
+ movq mm0, [esi + eax*4 + 24]
+ movd mm1, [esi + eax*4 + 32]
+ /* copy & expand to mm2-mm4 for the H interactions */
+ movq mm2, mm0
+ movq mm3, mm0
+ movq mm4, mm1
+ punpckldq mm2,mm2
+ punpckhdq mm3,mm3
+ punpckldq mm4,mm4
+
+ pfsubr mm0, [esp + ixO]
+ pfsubr mm1, [esp + izO]
+
+ movq [esp + dxO], mm0
+ pfmul mm0,mm0
+ movd [esp + dzO], mm1
+ pfmul mm1,mm1
+ pfacc mm0, mm1
+ pfadd mm0, mm1 /* mm0=rsqO */
+
+ punpckldq mm2, mm2
+ punpckldq mm3, mm3
+ punpckldq mm4, mm4 /* mm2-mm4 is jx-jz */
+ pfsubr mm2, [esp + ixH]
+ pfsubr mm3, [esp + iyH]
+ pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+ movq [esp + dxH], mm2
+ movq [esp + dyH], mm3
+ movq [esp + dzH], mm4
+ pfmul mm2,mm2
+ pfmul mm3,mm3
+ pfmul mm4,mm4
+
+ pfadd mm3,mm2
+ pfadd mm3,mm4 /* mm3=rsqH */
+ movq [esp + tmprsqH], mm3
+
+ pfrsqrt mm1,mm0
+
+ movq mm2,mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+ pfmul mm0, mm1
+
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movd [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+ /* update vctot directly, use mm3 for fscal sum */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+ pxor mm3,mm3
+ pfsub mm3, mm7
+ pfmul mm3, [esp + tsc]
+ pfmul mm3, mm1 /* mm3 is total fscal (for the oxygen) now */
+
+ movq mm0, [esp + tmprsqH]
+
+ pfrsqrt mm1, mm0
+ pswapd mm0,mm0
+ pfrsqrt mm2, mm0
+ pswapd mm0,mm0
+ punpckldq mm1,mm2 /* seeds are in mm1 now, and rsq in mm0. */
+
+ movq mm2, mm1
+ pfmul mm1,mm1
+ pfrsqit1 mm1,mm0
+ pfrcpit2 mm1,mm2 /* mm1=invsqrt */
+
+ pfmul mm0,mm1 /* mm0=r */
+ pfmul mm0, [esp + tsc]
+ pf2iw mm4, mm0
+ movq [esp + n1], mm4
+ pi2fd mm4,mm4
+ pfsub mm0, mm4 /* now mm0 is eps and mm4 n0 */
+ movq mm2, mm0
+ pfmul mm2, mm2 /* mm0 is eps, mm2 eps2 */
+
+ /* coulomb table */
+ mov edx, [ebp + VFtab]
+ mov ecx, [esp + n1]
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ /* load all values we need */
+ movd mm4, [edx + ecx*4]
+ movd mm5, [edx + ecx*4 + 4]
+ movd mm6, [edx + ecx*4 + 8]
+ movd mm7, [edx + ecx*4 + 12]
+ mov ecx, [esp + n1 + 4]/* mm5 = Fp */
+ lea ecx, [ecx + ecx*2]
+ shl ecx, 2
+ punpckldq mm4, [edx + ecx*4]
+ punpckldq mm5, [edx + ecx*4 + 4]
+ punpckldq mm6, [edx + ecx*4 + 8]
+ punpckldq mm7, [edx + ecx*4 + 12]
+
+
+ pfmul mm6, mm0 /* mm6 = Geps */
+ pfmul mm7, mm2 /* mm7 = Heps2 */
+
+ pfadd mm5, mm6
+ pfadd mm5, mm7 /* mm5 = Fp */
+
+ pfmul mm7, [esp + two] /* two*Heps2 */
+ pfadd mm7, mm6
+ pfadd mm7, mm5 /* mm7=FF */
+
+ pfmul mm5, mm0 /* mm5=eps*Fp */
+ pfadd mm5, mm4 /* mm5= VV */
+
+ pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+ pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+ /* update vctot */
+ pfadd mm5, [esp + vctot]
+ movq [esp + vctot], mm5
+
+ /* change sign of fijC and multiply by rinv */
+ pxor mm4,mm4
+ pfsub mm4, mm7
+ pfmul mm4, [esp + tsc]
+ pfmul mm4, mm1 /* mm4 is total fscal (for the hydrogens) now */
+
+ /* spread oxygen fscalar to both positions */
+ punpckldq mm3,mm3
+ /* calc vectorial force for O */
+ movq mm0, [esp + dxO]
+ movd mm1, [esp + dzO]
+ pfmul mm0, mm3
+ pfmul mm1, mm3
+
+ /* calc vectorial force for H's */
+ movq mm5, [esp + dxH]
+ movq mm6, [esp + dyH]
+ movq mm7, [esp + dzH]
+ pfmul mm5, mm4
+ pfmul mm6, mm4
+ pfmul mm7, mm4
+
+ /* update iO particle force */
+ movq mm2, [esp + fixO]
+ movd mm3, [esp + fizO]
+ pfadd mm2, mm0
+ pfadd mm3, mm1
+ movq [esp + fixO], mm2
+ movd [esp + fizO], mm3
+
+ /* update iH forces */
+ movq mm2, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm4, [esp + fizH]
+ pfadd mm2, mm5
+ pfadd mm3, mm6
+ pfadd mm4, mm7
+ movq [esp + fixH], mm2
+ movq [esp + fiyH], mm3
+ movq [esp + fizH], mm4
+
+ /* pack j forces from H in the same form as the oxygen force. */
+ pfacc mm5, mm6 /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+ pfacc mm7, mm7 /* mm7(l)=fjz(H1+H2) */
+
+ pfadd mm0, mm5 /* add up total force on j particle. */
+ pfadd mm1, mm7
+
+ /* update j particle force */
+ movq mm2, [edi + eax*4 + 24]
+ movd mm3, [edi + eax*4 + 32]
+ pfsub mm2, mm0
+ pfsub mm3, mm1
+ movq [edi + eax*4 + 24], mm2
+ movd [edi + eax*4 + 32], mm3
+
+ /* done - one more? */
+ dec dword ptr [esp + innerk]
+ jz .i3330_updateouterdata
+ jmp .i3330_inner_loop
+.i3330_updateouterdata:
+ mov ecx, [esp + ii3]
+
+ movq mm6, [edi + ecx*4] /* increment iO force */
+ movd mm7, [edi + ecx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ movq [edi + ecx*4], mm6
+ movd [edi + ecx*4 +8], mm7
+
+ movq mm0, [esp + fixH]
+ movq mm3, [esp + fiyH]
+ movq mm1, [esp + fizH]
+ movq mm2, mm0
+ punpckldq mm0, mm3 /* mm0(l)=fxH1, mm0(h)=fyH1 */
+ punpckhdq mm2, mm3 /* mm2(l)=fxH2, mm2(h)=fyH2 */
+ movq mm3, mm1
+ pswapd mm3,mm3
+ /* mm1 is fzH1 */
+ /* mm3 is fzH2 */
+
+ movq mm6, [edi + ecx*4 + 12] /* increment iH1 force */
+ movd mm7, [edi + ecx*4 + 20]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ movq [edi + ecx*4 + 12], mm6
+ movd [edi + ecx*4 + 20], mm7
+
+ movq mm6, [edi + ecx*4 + 24] /* increment iH2 force */
+ movd mm7, [edi + ecx*4 + 32]
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [edi + ecx*4 + 24], mm6
+ movd [edi + ecx*4 + 32], mm7
+
+
+ mov ebx, [ebp + fshift] /* increment fshift force */
+ mov edx, [esp + is3]
+
+ movq mm6, [ebx + edx*4]
+ movd mm7, [ebx + edx*4 + 8]
+ pfadd mm6, [esp + fixO]
+ pfadd mm7, [esp + fizO]
+ pfadd mm6, mm0
+ pfadd mm7, mm1
+ pfadd mm6, mm2
+ pfadd mm7, mm3
+ movq [ebx + edx*4], mm6
+ movd [ebx + edx*4 + 8], mm7
+
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ movq mm7, [esp + vctot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vc]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vc[gid] */
+
+ movq mm7, [esp + vnbtot]
+ pfacc mm7,mm7 /* get and sum the two parts of total potential */
+
+ mov eax, [ebp + Vnb]
+ movd mm6, [eax + edx*4]
+ pfadd mm6, mm7
+ movd [eax + edx*4], mm6 /* increment vnbtot[gid] */
+ /* finish if last */
+ dec dword ptr [ebp + nri]
+ jz .i3330_end
+ /* not last, iterate once more! */
+ jmp .i3330_outer
+.i3330_end:
+ femms
+ add esp, 212
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+++ /dev/null
-;;
-;; This source code is part of
-;;
-;; G R O M A C S
-;;
-;; GROningen MAchine for Chemical Simulations
-;;
-;; VERSION 3.0
-;;
-;; Copyright (c) 1991-2001
-;; BIOSON Research Institute, Dept. of Biophysical Chemistry
-;; University of Groningen, The Netherlands
-;;
-;; This program is free software; you can redistribute it and/or
-;; modify it under the terms of the GNU General Public License
-;; as published by the Free Software Foundation; either version 2
-;; of the License, or (at your option) any later version.
-;;
-;; If you want to redistribute modifications, please consider that
-;; scientific software is very special. Version control is crucial -
-;; bugs must be traceable. We will be happy to consider code for
-;; inclusion in the official distribution, but derived work must not
-;; be called official GROMACS. Details are found in the README & COPYING
-;; files - if they are missing, get the official version at www.gromacs.org.
-;;
-;; To help us fund GROMACS development, we humbly ask that you cite
-;; the papers on the package - you can find them in the top README file.
-;;
-;; Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
-;;
-;; And Hey:
-;; GROup of MAchos and Cynical Suckers
-
-
-; NASM macro set to make interfacing to 32-bit programs easier -*- nasm -*-
-%imacro proc 1 ; begin a procedure definition: "proc name" exports the label and sets up an ebp stack frame
-%push proc ; open a preprocessor context named "proc"; endproc tests for it and pops it
- global %1 ; export the procedure name
-%1: push ebp ; emit the entry label and save the caller's frame pointer
- mov ebp,esp ; ebp now addresses this procedure's frame
-%assign %$arg 8 ; running ebp-relative offset given to the next "arg"; 8 skips the saved ebp and the 32-bit return address
-%define %$procname %1 ; remember the name so endproc can emit the __end_<name> label
-%endmacro
-
-
-
-%imacro arg 0-1 4 ; used with the argument name as a label; the optional parameter is the argument size in bytes (default 4)
-%00 equ %$arg ; the label written in front of "arg" becomes that argument's offset from ebp
-%assign %$arg %1+%$arg ; advance the running offset by this argument's size
-%endmacro
-
-
-
-%imacro endproc 0 ; end a procedure definition that was started with "proc"
-%ifnctx proc ; innermost context is not "proc": there is no matching proc to close
-%error Mismatched `endproc'/`proc'
-
-%else
- leave ; tear down the frame built by proc (restores esp and ebp)
- ret
-__end_%$procname: ; useful for calculating function size
-
-%pop ; close the "proc" context opened by proc
-%endif
-%endmacro
-
- ;; This file contains a subset of the gromacs innerloops
- ;; manually written in assembly to optimize performance
- ;; on AMD extended 3DNow-enabled processors like Athlon
- ;; and later generations.
- ;; Erik Lindahl, 2000, erik@theophys.kth.se
-
-segment .data ; floating-point constants; each is stored twice (8 bytes) so one movq fills both halves of an MMX register
-mm_two ; 2.0 in both halves
- dd 2.0
- dd 2.0
-mm_six ; 6.0 in both halves
- dd 6.0
- dd 6.0
-mm_twelve ; 12.0 in both halves
- dd 12.0
- dd 12.0
-
-
-segment .text
-
-
- global check3dnow ; tries to issue a simple 3DNOW instruction; reads no arguments and sets no return value
-check3dnow: ; NOTE(review): presumably the caller detects failure through an illegal-instruction trap - confirm against the C side
- femms ; fast MMX/x87 state switch before touching mm registers
- pfmul mm0,mm0 ; the probe instruction: its result is never used
- femms ; leave the MMX state clean again
- ret
-
-
- global vecrecip_3dnow ; writes the reciprocal 1/x of each 32-bit float in an input array to an output array
-vecrecip_3dnow ; stack arguments: [ebp+8] = input pointer, [ebp+12] = output pointer, [ebp+16] = element count
- push ebp
- mov ebp,esp
- push eax ; eax, ebx, ecx and edx are saved here and restored before returning
- push ebx
- push ecx
- push edx
-
- mov eax, [ebp + 8] ; eax = read pointer
- mov ebx, [ebp + 12] ; ebx = write pointer
- mov ecx, [ebp + 16] ; ecx = number of elements
- mov edx, ecx ; keep the full count for the tail loop
- shr ecx, 2 ; main loop handles four elements per iteration
- jecxz .tail ; fewer than four elements: go straight to the tail
- emms
-.mainloop:
- movq mm0,[eax] ; elements 0 and 1
- add eax, byte 8
- pfrcp mm1,mm0 ; approximate reciprocal (seed) of the low element
- movq mm4,[eax] ; elements 2 and 3
- pswapd mm0,mm0 ; swap halves so the high element can be seeded too
- add eax, byte 8
- pfrcp mm2,mm0 ; seed for the (former) high element
- pswapd mm0,mm0 ; swap back to the original order
- pfrcp mm5,mm4 ; same seeding sequence for elements 2 and 3
- pswapd mm4,mm4
- punpckldq mm1,mm2 ; mm1 = seeds for elements 0 and 1
- pfrcp mm6,mm4
- pswapd mm4,mm4
- pfrcpit1 mm0,mm1 ; first refinement step for elements 0/1
- punpckldq mm5,mm6 ; mm5 = seeds for elements 2 and 3
- pfrcpit2 mm0,mm1 ; second refinement step: mm0 = 1/x for elements 0/1
- movq [ebx],mm0
- pfrcpit1 mm4,mm5 ; refinement for elements 2/3
- add ebx, byte 8
- pfrcpit2 mm4,mm5 ; mm4 = 1/x for elements 2/3
- movq [ebx],mm4
- add ebx, byte 8
- dec ecx
- jecxz .tail ; all groups of four done
- jmp short .mainloop
-.tail:
- mov ecx,edx
- and ecx,3 ; 0-3 elements remain after the groups of four
- jecxz .end
-.tailloop: ; one element per iteration, same seed + two-step refinement
- movd mm0,[eax]
- add eax, byte 4
- pfrcp mm1,mm0
- pfrcpit1 mm0,mm1
- pfrcpit2 mm0,mm1
- movd [ebx],mm0
- add ebx, byte 4
- dec ecx
- jecxz .end
- jmp short .tailloop
-.end:
- emms ; clear the MMX state before returning
- pop edx
- pop ecx
- pop ebx
- pop eax
- leave
- ret
-
-
-segment .text
-
- global vecinvsqrt_3dnow ; writes the inverse square root 1/sqrt(x) of each 32-bit float in an input array to an output array
-vecinvsqrt_3dnow ; stack arguments: [ebp+8] = input pointer, [ebp+12] = output pointer, [ebp+16] = element count
- push ebp
- mov ebp,esp
- push eax ; eax, ebx, ecx and edx are saved here and restored before returning
- push ebx
- push ecx
- push edx
-
- mov eax, [ebp + 8] ; eax = read pointer
- mov ebx, [ebp + 12] ; ebx = write pointer
- mov ecx, [ebp + 16] ; ecx = number of elements
- mov edx, ecx ; keep the full count for the tail loop
- shr ecx, 2 ; main loop handles four elements per iteration
- jecxz .tail ; fewer than four elements: go straight to the tail
- emms
-.mainloop:
- movq mm0,[eax] ; elements 0 and 1
- add eax, byte 8
- pfrsqrt mm1,mm0 ; approximate 1/sqrt (seed) of the low element
- movq mm4,[eax] ; elements 2 and 3
- pswapd mm0,mm0 ; swap halves so the high element can be seeded too
- add eax, byte 8
- pfrsqrt mm2,mm0 ; seed for the (former) high element
- pswapd mm0,mm0 ; swap back to the original order
- pfrsqrt mm5,mm4 ; same seeding sequence for elements 2 and 3
- pswapd mm4,mm4
- punpckldq mm1,mm2 ; mm1 = seeds for elements 0 and 1
- pfrsqrt mm6,mm4
- movq mm3,mm1 ; keep a copy of the seeds for the final step
- pswapd mm4,mm4
- pfmul mm1,mm1 ; seed squared, the input pfrsqit1 expects
- punpckldq mm5,mm6 ; mm5 = seeds for elements 2 and 3
- pfrsqit1 mm1,mm0 ; first refinement step for elements 0/1
- movq mm7,mm5 ; copy of the seeds for elements 2/3
- pfrcpit2 mm1,mm3 ; second refinement step: mm1 = 1/sqrt(x) for elements 0/1
- pfmul mm5,mm5
- movq [ebx],mm1
- pfrsqit1 mm5,mm4 ; refinement for elements 2/3
- add ebx, byte 8
- pfrcpit2 mm5,mm7 ; mm5 = 1/sqrt(x) for elements 2/3
- movq [ebx],mm5
- add ebx, byte 8
- dec ecx
- jecxz .tail ; all groups of four done
- jmp short .mainloop
-.tail:
- mov ecx,edx
- and ecx,3 ; 0-3 elements remain after the groups of four
- jecxz .end
-.tailloop: ; one element per iteration, same seed + two-step refinement
- movd mm0,[eax]
- add eax, byte 4
- pfrsqrt mm1,mm0
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2
- movd [ebx],mm1
- add ebx, byte 4
- dec ecx
- jecxz .end
- jmp short .tailloop
-.end:
- emms ; clear the MMX state before returning
- pop edx
- pop ecx
- pop ebx
- pop eax
- leave
- ret
-
-
-
-proc inl0100_3dnow ; inner loop for a 12-6 pair potential (c12/r^12 - c6/r^6): forces go to faction/fshift, energy to Vnb[gid]
-%$nri arg ; number of i particles still to process (counted down to zero)
-%$iinr arg ; pointer into the i particle index list (advanced in place)
-%$jindex arg ; pointer into jindex[]; entries n and n+1 bound the j list of i particle n
-%$jjnr arg ; base of the j particle index list
-%$shift arg ; pointer into the shift index list (advanced in place)
-%$shiftvec arg ; base of the shift vectors, three floats per entry
-%$fshift arg ; base of the shift force array, three floats per entry
-%$gid arg ; pointer into the group index list (advanced in place)
-%$pos arg ; base of the coordinates, three floats per particle
-%$faction arg ; base of the force array, three floats per particle
-%$type arg ; base of the per-particle type array
-%$ntype arg ; multiplied with the i particle type to form the nbfp row offset
-%$nbfp arg ; parameter table: c6 and c12 stored as consecutive floats
-%$Vnb arg ; base of the per-group potential energy array
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ix equ 8
-.iy equ 12
-.iz equ 16
-.vnbtot equ 20 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 28 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 36 ; repeated (64bit) to fill 3dnow reg
-.six equ 44 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 52 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 60
-.innerjjnr equ 64
-.innerk equ 68
-.fix equ 72
-.fiy equ 76
-.fiz equ 80
-.dx1 equ 84
-.dy1 equ 88
-.dz1 equ 92
-.dx2 equ 96
-.dy2 equ 100
-.dz2 equ 104
- push eax ; all six general registers used below are saved and restored at .end
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 108 ; local stack space
- femms
- ; move data to local stack
- movq mm0, [mm_six]
- movq mm1, [mm_twelve]
- movq [esp + .six], mm0
- movq [esp + .twelve], mm1
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4] ; edx = type[ii]
- imul edx, [ebp + %$ntype]
- shl edx, 1 ; ntia = 2*ntype*type[ii]
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2 ; innerk is kept two below the remaining count, so the sign tells whether a full pair is left
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos] ; esi was reused for nbfp above; point it back at pos[]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrcp mm0, mm4 ; lookup reciprocal seed
- pfrcp mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- ; amd 3dnow N-R iteration to get full precision.
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0
- ;; mm4 now contains invsq,
- ;; do potential and fscal
- movq mm0, mm4
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12] ; mm5 = vnb12 = c12*rinvtwelve
- pfmul mm4, [esp + .c6] ; mm4 = vnb6 = c6*rinvsix
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six] ; 6*vnb6
-
- pfmul mm5, [esp + .twelve] ; 12*vnb12
- pfsub mm5,mm4 ; 12*vnb12 - 6*vnb6
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the first pair in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the second pair in both halves
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2 ; the j particles get the opposite force
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1 ; innerk is -1 when one j particle is left and -2 when none is; bit 0 tells them apart
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrcp mm0,mm4
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0 ; mm4=invsq
- ;; calculate potentials and scalar force
- movq mm0, mm4
-
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5, mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 108 ; release the local stack space reserved on entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-proc inl0110_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$nsatoms arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.vnbtot equ 32 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 40 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 48 ; repeated (64bit) to fill 3dnow reg
-.six equ 56 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 64 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 72
-.innerjjnr0 equ 76
-.innerk0 equ 80
-.innerjjnr equ 84
-.innerk equ 88
-.fix equ 92
-.fiy equ 96
-.fiz equ 100
-.dx1 equ 104
-.dy1 equ 108
-.dz1 equ 112
-.dx2 equ 116
-.dy2 equ 120
-.dz2 equ 124
-.nsvdwc equ 128
-.nscoul equ 132
-.nsvdw equ 136
-.solnr equ 140
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 144 ; local stack space
- femms
- movq mm0, [mm_six]
- movq mm1, [mm_twelve]
- movq [esp + .six], mm0
- movq [esp + .twelve], mm1
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vnbtot], mm7
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testvdw
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrcp mm0, mm4 ; lookup reciprocal seed
- pfrcp mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- ; amd 3dnow N-R iteration to get full precision.
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0
- ;; mm4 now contains invsq,
- ;; do potential and fscal
- movq mm0, mm4
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5,mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdwc_inner
- jmp .updateouterdata_vdwc
-.single_vdwc_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrcp mm0,mm4
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0 ; mm4=invsq
- ;; calculate potentials and scalar force
- movq mm0, mm4
-
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5, mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testvdw
- jmp .mno_vdwc
-.testvdw
- mov ebx, [esp + .nscoul]
- add [esp + .solnr], dword ebx
-
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrcp mm0, mm4 ; lookup reciprocal seed
- pfrcp mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- ; amd 3dnow N-R iteration to get full precision.
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0
- ;; mm4 now contains invsq,
- ;; do potential and fscal
- movq mm0, mm4
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5,mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdw_inner
- jmp .updateouterdata_vdw
-.single_vdw_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrcp mm0,mm4
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0 ; mm4=invsq
- ;; calculate potentials and scalar force
- movq mm0, mm4
-
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5, mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 144
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-;; inl0300_3dnow
-;;
-;; 3DNow! nonbonded inner loop using tabulated dispersion and repulsion
-;; only (no Coulomb term is computed in this routine).
-;; For every outer (i) particle it walks the j particles listed in
-;; jjnr[jindex[n] .. jindex[n+1]-1], two per iteration plus a single-particle
-;; tail, interpolates the dispersion and repulsion entries of VFtab, scales
-;; them with c6/c12 from nbfp, and accumulates:
-;;   - forces into faction[] (i and j particles) and fshift[]
-;;   - the potential into Vnb[gid]
-;;
-;; Arguments (read as [ebp + %$name]; the proc/arg/endproc macros are defined
-;; outside this chunk):
-;;   nri       number of outer iterations. Must be >= 1: the loop body runs
-;;             before any test, and the counter is only checked after dec.
-;;   iinr      pointer to the i particle indices (one dword per iteration)
-;;   jindex    pointer to dword offsets into jjnr (entries n and n+1 are read)
-;;   jjnr      base of the j particle index list (dwords)
-;;   shift     pointer to the shift index (one dword per iteration)
-;;   shiftvec  base of the shift vectors, 3 floats per entry
-;;   fshift    base of the shift forces, 3 floats per entry
-;;   gid       pointer to the group index (one dword per iteration)
-;;   pos       base of the coordinates, 3 floats per particle
-;;   faction   base of the forces, 3 floats per particle
-;;   type      base of the per-particle type indices (dwords)
-;;   ntype     dword multiplier for the i type (presumably the number of
-;;             atom types - confirm against the caller)
-;;   nbfp      c6/c12 float pairs, indexed by 2*ntype*type[i] + 2*type[j]
-;;   Vnb       float array indexed by the group index
-;;   tabscale  float; r is multiplied by it to get the table position
-;;   VFtab     float table, 8 floats (32 bytes) per point: 4 dispersion
-;;             values followed by 4 repulsion values
-;;
-;; NOTE: the nri, iinr, jindex, shift and gid argument slots are updated in
-;; place on the stack while iterating.
-;; eax, ebx, ecx, edx, esi and edi are saved on entry and restored on exit;
-;; femms is issued on entry and before returning.
-proc inl0300_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; (.iy, .fiy, .dy1 and .dy2 are never named in the code; they are the
- ;; upper halves of the 64-bit movq accesses at .ix, .fix, .dx1 and .dx2)
-.is3 equ 0
-.ii3 equ 4
-.ix equ 8
-.iy equ 12
-.iz equ 16
-.vnbtot equ 20 ; 64bit: two partial potential sums, added together per i particle
-.c6 equ 28 ; 64bit: c6 for 1st / 2nd j particle
-.c12 equ 36 ; 64bit: c12 for 1st / 2nd j particle
-.two equ 44 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 52 ; 64bit: table index for 1st / 2nd j particle
-.tabscale equ 60 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 68
-.innerjjnr equ 72
-.innerk equ 76
-.fix equ 80
-.fiy equ 84
-.fiz equ 88
-.dx1 equ 92
-.dy1 equ 96
-.dz1 equ 100
-.dx2 equ 104
-.dy2 equ 108
-.dz2 equ 112
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 116 ; local stack space (offsets 0..115, .dz2 is the last dword)
- femms
- ; move data to local stack
- movq mm0, [mm_two] ; mm_two is defined outside this chunk (presumably a pair of 2.0 - confirm)
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- punpckldq mm3,mm3 ; duplicate tabscale into both halves
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4] ; edx = type[ii]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; (esi is reloaded inside the inner loops)
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the whole outer iteration
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; innerk = number of innerloop atoms - 2
- jge .unroll_loop ; tests the flags of the sub above (mov leaves flags untouched)
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0 ; rsq * invsqrt = r
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1 ; integer table index for both pairs
- ;; NOTE(review): pf2iw produces sign-extended 16-bit results per the AMD
- ;; extended 3DNow! documentation, which would bound the usable table
- ;; index - confirm against the table sizes used by the caller.
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- ; dispersion table
- mov ecx, [esp + .n1]
- shl ecx, 3 ; 8 floats per table point (ecx*4 below gives the byte offset)
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; table index of the second pair
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch the stored dr (.dx1 stack slot) used just below
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the 1st pair in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the 2nd pair in both halves
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2 ; the j particles get the opposite force
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; innerk is -1 here when one j particle is left over and -2 when none
- ;; is, so its lowest bit tells whether the single iteration is needed.
- and [esp + .innerk], dword 1
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4 ; only the low table index is needed here
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale] ; same three factors as in the unrolled loop, multiplied in a different order
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; 64-bit read starting at .fiz; only the low dword is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx ; the remaining count is kept in the argument slot
- jmp .outer
-.end:
- femms
- add esp, 116 ; release the local stack space reserved on entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl0310_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.vnbtot equ 32 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 40 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 48 ; repeated (64bit) to fill 3dnow reg
-.two equ 56 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 64 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 72 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 80
-.innerjjnr0 equ 84
-.innerk0 equ 88
-.innerjjnr equ 92
-.innerk equ 96
-.fix equ 100
-.fiy equ 104
-.fiz equ 108
-.dx1 equ 112
-.dy1 equ 116
-.dz1 equ 120
-.dx2 equ 124
-.dy2 equ 128
-.dz2 equ 132
-.nsvdwc equ 136
-.nscoul equ 140
-.nsvdw equ 144
-.solnr equ 148
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 152 ; local stack space
- femms
- movq mm0, [mm_two]
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vnbtot], mm7
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testvdw
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- ; dispersion table
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdwc_inner
- jmp .updateouterdata_vdwc
-.single_vdwc_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testvdw
- jmp .mno_vdwc
-.testvdw
- mov ebx, [esp + .nscoul]
- add [esp + .solnr], dword ebx
-
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- ; dispersion table
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 3
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdw_inner
- jmp .updateouterdata_vdw
-.single_vdw_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 3
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 152
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl1000_3dnow ; 3DNow! Coulomb inner loop: per i particle, sums qq*invsqrt into Vc[gid] and updates faction[]/fshift[]
-%$nri arg ; outer-loop counter, decremented once per i particle until it reaches zero
-%$iinr arg ; pointer into iinr[] (i particle indices), advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; entries n and n+1 bound this i particle's range in jjnr[]
-%$jjnr arg ; base of jjnr[] (j particle indices, 4 bytes each)
-%$shift arg ; pointer into shift[] (shift index per i particle), advanced 4 bytes per outer iteration
-%$shiftvec arg ; base of shiftvec[] (3 floats per shift index)
-%$fshift arg ; base of fshift[] (3 floats per shift index), incremented with the summed i force
-%$gid arg ; pointer into gid[] (group index per i particle), advanced 4 bytes per outer iteration
-%$pos arg ; base of pos[] (3 floats per particle)
-%$faction arg ; base of faction[] (3 floats per particle), updated in place
-%$charge arg ; base of charge[] (one float per particle)
-%$facel arg ; float factor multiplied into charge[ii]
-%$Vc arg ; base of Vc[], indexed by the group index read from gid[]
- ;; stack offsets for local variables (x/y pairs are accessed with one 64-bit movq, z with a 32-bit movd)
-.is3 equ 0 ; 3*shift index
-.ii3 equ 4 ; 3*i particle index
-.ix equ 8 ; shifted i coordinates
-.iy equ 12
-.iz equ 16
-.iq equ 20 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 28 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 36 ; running pointer into jjnr[]
-.innerk equ 40 ; remaining inner-loop count (biased by -2 for the unrolled loop)
-.fix equ 44 ; force accumulated on the i particle
-.fiy equ 48
-.fiz equ 52
-.dx1 equ 56 ; distance vector to the first j particle
-.dy1 equ 60
-.dz1 equ 64
-.dx2 equ 68 ; distance vector to the second j particle (unrolled loop only)
-.dy2 equ 72
-.dz2 equ 76
- push eax ; save the registers used below; restored in reverse order at .end
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 80 ; 80 bytes local stack space
- femms ; reset MMX/x87 state before using 3DNow! registers
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge] ; edx = base of charge[]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; low half = facel*charge[ii]; high half is overwritten by the punpckldq below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; store ii3 for the force update in .updateouterdata
- pfadd mm1, mm3 ; iz = shZ + posZ
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4 ; advance the jindex pointer
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi = base of pos[] for the inner loops
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the inner loops
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2 ; flags from this sub decide the jge below (mov leaves flags untouched)
- mov [esp + .innerk], edx ; number of innerloop atoms minus 2
- jge .unroll_loop ; at least two j particles left: take the paired loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- movd mm7, [ecx + ebx*4] ; charge[jnr2]
- punpckldq mm3,mm7 ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- prefetchw [esp + .dx1] ; prefetch the stack area at .dx1 for writing
-
- pfmul mm3,mm1 ; mm3 has both vcoul (qq*invsqrt)
- pfmul mm0,mm3 ; mm0 has both fscal (vcoul*invsq)
-
- ;; update vctot
-
- pfadd mm3, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm3 ; store the sum
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the first pair in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the second pair in both halves
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2 ; add the force from the first pair
- pfadd mm1, mm3
-
- pfadd mm0, mm4 ; add the force from the second pair
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2 ; j particles get the opposite force
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner ; fewer than two j particles left
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1 ; innerk is negative here; its low bit tells whether one j particle remains
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm6, [esp + .iq]
- movd mm7, [ecx + eax*4]
- pfmul mm6, mm7 ; mm6=qq
-
- lea eax, [eax + eax*2] ; replace jnr with j3
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm2, [esi + eax*4]
- movd mm3, [esi + eax*4 + 8]
- pfsub mm0, mm2 ; dr = ir - jr
- pfsub mm1, mm3
- movq [esp + .dx1], mm0
- pfmul mm0,mm0
- movd [esp + .dz1], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfacc mm0, mm1 ; mm0=rsq
-
- pfrsqrt mm1,mm0
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
- ;; update vctot
- movq mm5, [esp + .vctot]
- pfadd mm5, mm6
- movq [esp + .vctot], mm5
- ;; spread fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dx1]
- movd mm1, [esp + .dz1]
- pfmul mm0, mm4
- pfmul mm1, mm4
- ;; update i particle force
- movq mm2, [esp + .fix]
- movd mm3, [esp + .fiz]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fix], mm2
- movd [esp + .fiz], mm3
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax *4+ 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; 64-bit memory operand; only the low half (fiz) is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; 64-bit memory operand; only the low half (fiz) is stored below
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms ; leave 3DNow!/MMX state before returning
- add esp, 80 ; release the local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl1010_3dnow ; 3DNow! Coulomb inner loop that repeats the j loop for nscoul consecutive i particles per outer iteration
-%$nri arg ; outer-loop counter, decremented once per outer iteration until it reaches zero
-%$iinr arg ; pointer into iinr[] (first i particle index), advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; entries n and n+1 bound this iteration's range in jjnr[]
-%$jjnr arg ; base of jjnr[] (j particle indices, 4 bytes each)
-%$shift arg ; pointer into shift[] (shift index), advanced 4 bytes per outer iteration
-%$shiftvec arg ; base of shiftvec[] (3 floats per shift index)
-%$fshift arg ; base of fshift[] (3 floats per shift index), incremented with each summed i force
-%$gid arg ; pointer into gid[] (group index), advanced 4 bytes per outer iteration
-%$pos arg ; base of pos[] (3 floats per particle)
-%$faction arg ; base of faction[] (3 floats per particle), updated in place
-%$charge arg ; base of charge[] (one float per particle)
-%$facel arg ; float factor multiplied into charge[ii]
-%$Vc arg ; base of Vc[], indexed by the group index read from gid[]
-%$nsatoms arg ; pointer into nsatoms[]; offset by 8 bytes once, then read and advanced 12 bytes per outer iteration
- ;; stack offsets for local variables (x/y pairs are accessed with one 64-bit movq, z with a 32-bit movd)
-.is3 equ 0 ; 3*shift index
-.ii3 equ 4 ; 3*current i particle index
-.shX equ 8 ; shift vector, kept so it can be added to every i particle
-.shY equ 12
-.shZ equ 16
-.ix equ 20 ; shifted i coordinates
-.iy equ 24
-.iz equ 28
-.iq equ 32 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 40 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr0 equ 48 ; start of the jjnr[] range, reloaded for every i particle
-.innerk0 equ 52 ; number of j particles, reloaded for every i particle
-.innerjjnr equ 56 ; running pointer into jjnr[]
-.innerk equ 60 ; remaining inner-loop count (biased by -2 for the unrolled loop)
-.fix equ 64 ; force accumulated on the current i particle
-.fiy equ 68
-.fiz equ 72
-.dx1 equ 76 ; distance vector to the first j particle
-.dy1 equ 80
-.dz1 equ 84
-.dx2 equ 88 ; distance vector to the second j particle (unrolled loop only)
-.dy2 equ 92
-.dz2 equ 96
-.nscoul equ 100 ; number of i particles still to process in this outer iteration
-.solnr equ 104 ; index of the next i particle to process
- push eax ; save the registers used below; restored in reverse order at .end
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 108 ; local stack space
- femms ; reset MMX/x87 state before using 3DNow! registers
- ;; assume we have at least one i particle - start directly
- add [ebp + %$nsatoms], dword 8 ; point at the third int of the first nsatoms entry
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- mov ecx, [eax] ; ecx = count read from nsatoms[]
- add [ebp + %$nsatoms], dword 12 ; advance by three ints to the next entry
- mov [esp + .nscoul], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- mov [esp + .solnr], ebx ; first i particle index for the .mno_coul loop
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4 ; advance the jindex pointer
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos] ; esi = base of pos[] for the inner loops
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the inner loops
-
- mov ecx, [esp + .nscoul]
- cmp ecx, dword 0
- jnz .mno_coul
- jmp .last_mno ; nothing to do for this outer iteration
-.mno_coul:
- mov ebx, [esp + .solnr] ; ebx = current i particle index
- inc dword [esp + .solnr] ; next pass uses the following particle index
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; low half = facel*charge[ii]; high half is overwritten by the punpckldq below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX] ; ix/iy = pos + shift
- pfadd mm1, [esp + .shZ] ; 64-bit memory operand; only the low half (iz) is stored below
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx ; restart at the beginning of the jjnr[] range
- mov edx, [esp + .innerk0]
- sub edx, dword 2 ; flags from this sub decide the jge below (mov leaves flags untouched)
- mov [esp + .innerk], edx ; number of innerloop atoms minus 2
- jge .unroll_coul_loop ; at least two j particles left: take the paired loop
- jmp .finish_coul_inner
-.unroll_coul_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- movd mm7, [ecx + ebx*4] ; charge[jnr2]
- punpckldq mm3,mm7 ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- prefetchw [esp + .dx1] ; prefetch the stack area at .dx1 for writing
-
- pfmul mm3,mm1 ; mm3 has both vcoul (qq*invsqrt)
- pfmul mm0,mm3 ; mm0 has both fscal (vcoul*invsq)
-
- ;; update vctot
-
- pfadd mm3, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm3 ; store the sum
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the first pair in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the second pair in both halves
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2 ; add the force from the first pair
- pfadd mm1, mm3
-
- pfadd mm0, mm4 ; add the force from the second pair
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2 ; j particles get the opposite force
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_coul_inner ; fewer than two j particles left
- jmp .unroll_coul_loop
-.finish_coul_inner:
- and [esp + .innerk], dword 1 ; innerk is negative here; its low bit tells whether one j particle remains
- jnz .single_coul_inner
- jmp .updateouterdata_coul
-.single_coul_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm6, [esp + .iq]
- movd mm7, [ecx + eax*4]
- pfmul mm6, mm7 ; mm6=qq
-
- lea eax, [eax + eax*2] ; replace jnr with j3
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm2, [esi + eax*4]
- movd mm3, [esi + eax*4 + 8]
- pfsub mm0, mm2 ; dr = ir - jr
- pfsub mm1, mm3
- movq [esp + .dx1], mm0
- pfmul mm0,mm0
- movd [esp + .dz1], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfacc mm0, mm1 ; mm0=rsq
-
- pfrsqrt mm1,mm0
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
- ;; update vctot
- movq mm5, [esp + .vctot]
- pfadd mm5, mm6
- movq [esp + .vctot], mm5
- ;; spread fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dx1]
- movd mm1, [esp + .dz1]
- pfmul mm0, mm4
- pfmul mm1, mm4
- ;; update i particle force
- movq mm2, [esp + .fix]
- movd mm3, [esp + .fiz]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fix], mm2
- movd [esp + .fiz], mm3
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax *4+ 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
- ;; done!
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; 64-bit memory operand; only the low half (fiz) is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; 64-bit memory operand; only the low half (fiz) is stored below
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .last_mno ; all i particles of this outer iteration are done
- jmp .mno_coul
-.last_mno:
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms ; leave 3DNow!/MMX state before returning
- add esp, 108 ; release the local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl1020_3dnow ; Coulomb-only loop: each 3-site i molecule (O,H1,H2) against a list of single j atoms
-%$nri arg ; outer-loop counter, decremented in place once per i molecule
-%$iinr arg ; pointer into iinr[] (i indices), advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; jindex[n]..jindex[n+1] delimit the j list of this i
-%$jjnr arg ; base of jjnr[] (j indices)
-%$shift arg ; pointer into shift[] (shift index per i), advanced per outer iteration
-%$shiftvec arg ; base of shiftvec[], 3 floats per shift index
-%$fshift arg ; base of fshift[], incremented with the summed i force per shift index
-%$gid arg ; pointer into gid[] (group index per i), advanced per outer iteration
-%$pos arg ; base of pos[], 3 floats per index
-%$faction arg ; base of faction[], 3 floats per index
-%$charge arg ; base of charge[]
-%$facel arg ; scalar factor multiplied into the i charges
-%$Vc arg ; base of Vc[], Vc[gid] is incremented with the summed potential
- ;; stack offsets for local variables (byte offsets from esp after the "sub esp" below)
-.is3 equ 0 ; 3*shift index
-.ii3 equ 4 ; 3*i index
-.ixO equ 8 ; shifted i O coordinates x,y,z
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; 64bit: H1 in low half, H2 in high half
-.iyH equ 28 ; 64bit: H1 in low half, H2 in high half
-.izH equ 36 ; 64bit: H1 in low half, H2 in high half
-.iqO equ 44 ; 64bit slot, facel*charge[ii] in the low half
-.iqH equ 52 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 60 ; 64bit: two partial sums, added together at the end of each outer iteration
-.innerjjnr equ 68 ; running pointer into jjnr[]
-.innerk equ 72 ; remaining j atoms for this i
-.fixO equ 76 ; accumulated force on i O x,y,z
-.fiyO equ 80
-.fizO equ 84
-.fixH equ 88 ; 64bit: H1 in low half, H2 in high half
-.fiyH equ 96 ; 64bit: H1 in low half, H2 in high half
-.fizH equ 104 ; 64bit: H1 in low half, H2 in high half
-.dxO equ 112 ; iO - j distance vector x,y,z
-.dyO equ 116
-.dzO equ 120
-.dxH equ 124 ; 64bit: H1 in low half, H2 in high half
-.dyH equ 132 ; 64bit: H1 in low half, H2 in high half
-.dzH equ 140 ; 64bit: H1 in low half, H2 in high half
- push eax ; save the general registers used below (restored in reverse order at .end)
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 148 ; local stack space
- femms ; enter 3dnow/mmx register state
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- ; the i charges are read once here, from the first i entry only, and reused for every outer iteration
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0]
- pfmul mm2, mm1
- movq [esp + .iqO], mm2 ; iqO = facel*charge[ii]
-
- movd mm2, [edx + ebx*4 + 4] ; mm2=charge[ii0+1]
- pfmul mm2, mm1
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iqH], mm2 ; iqH = facel*charge[ii0+1]
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 8-byte store: upper half lands on .ixH, which is rewritten below
-
- movd mm3, [eax + ebx*4 + 12] ; H1 x,y,z into the low halves of mm3-mm5
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24] ; H2 x,y,z into the high halves
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3 ; add the shift vector to both H coordinates
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fixO], mm7 ; clears fixO and fiyO
- movd [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi = base of pos[] for the inner loop
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the inner loop
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jindex[n] into jjnr[]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; one j atom per iteration: the O interaction uses the low halves, the H1/H2 pair is done packed.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
- ;prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge]
- movd mm7, [ecx + eax*4] ; mm7=charge[jnr]
- punpckldq mm7,mm7 ; spread to both halves
- movq mm6,mm7
- pfmul mm6, [esp + .iqO]
- pfmul mm7, [esp + .iqH] ; mm6=qqO, mm7=qqH
-
- lea eax, [eax + eax*2] ; eax = 3*jnr
-
- movq mm0, [esi + eax*4] ; mm0 = jx,jy
- movd mm1, [esi + eax*4 + 8] ; mm1 = jz
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2 ; mm2 = jx,jx
- punpckhdq mm3,mm3 ; mm3 = jy,jy
- punpckldq mm4,mm4 ; mm4 = jz,jz
-
- pfsubr mm0, [esp + .ixO] ; reversed subtract: d = i - j
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1 ; mm0(l) = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2 ; low halves were already duplicated above
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0 ; inverse square root seed for the O distance
-
- movq mm2,mm1 ; refine the seed (pfrsqit1/pfrcpit2 iteration)
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3 ; seed from the low half (H1)
- pswapd mm3,mm3
- pfrsqrt mm2, mm3 ; seed from the former high half (H2)
- pswapd mm3,mm3 ; swap back
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6 ; add the O potential to the packed H potentials
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0 ; the j atom gets the opposite of the summed i force
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk] ; counter is tested after the body, so the body always runs at least once
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO] ; only the low half is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3 ; bring fzH2 into the low half
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO] ; fshift += force on O + H1 + H2
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- ;; finish if last
- dec dword [ebp + %$nri] ; the nri argument slot itself is the loop counter
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms ; leave 3dnow/mmx register state
- add esp, 148 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl1030_3dnow ; Coulomb-only loop: each 3-site i molecule (O,H1,H2) against a list of 3-site j molecules
-%$nri arg ; outer-loop counter, decremented in place once per i molecule
-%$iinr arg ; pointer into iinr[] (i indices), advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; jindex[n]..jindex[n+1] delimit the j list of this i
-%$jjnr arg ; base of jjnr[] (j indices)
-%$shift arg ; pointer into shift[] (shift index per i), advanced per outer iteration
-%$shiftvec arg ; base of shiftvec[], 3 floats per shift index
-%$fshift arg ; base of fshift[], incremented with the summed i force per shift index
-%$gid arg ; pointer into gid[] (group index per i), advanced per outer iteration
-%$pos arg ; base of pos[], 3 floats per index
-%$faction arg ; base of faction[], 3 floats per index
-%$charge arg ; base of charge[]
-%$facel arg ; scalar factor multiplied into the charge products
-%$Vc arg ; base of Vc[], Vc[gid] is incremented with the summed potential
- ;; stack offsets for local variables (byte offsets from esp after the "sub esp" below)
-.is3 equ 0 ; 3*shift index
-.ii3 equ 4 ; 3*i index
-.ixO equ 8 ; shifted i O coordinates x,y,z
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; 64bit: H1 in low half, H2 in high half
-.iyH equ 28 ; 64bit: H1 in low half, H2 in high half
-.izH equ 36 ; 64bit: H1 in low half, H2 in high half
-.qqOO equ 44 ; 64bit slot, facel*qO*qO in the low half
-.qqOH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqHH equ 60 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 68 ; 64bit: two partial sums, added together at the end of each outer iteration
-.innerjjnr equ 76 ; running pointer into jjnr[]
-.innerk equ 80 ; remaining j molecules for this i
-.fixO equ 84 ; accumulated force on i O x,y,z
-.fiyO equ 88
-.fizO equ 92
-.fixH equ 96 ; 64bit: H1 in low half, H2 in high half
-.fiyH equ 104 ; 64bit: H1 in low half, H2 in high half
-.fizH equ 112 ; 64bit: H1 in low half, H2 in high half
-.dxO equ 120 ; iO - j site distance vector x,y,z
-.dyO equ 124
-.dzO equ 128
-.dxH equ 132 ; 64bit: H1 in low half, H2 in high half
-.dyH equ 140 ; 64bit: H1 in low half, H2 in high half
-.dzH equ 148 ; 64bit: H1 in low half, H2 in high half
- push eax ; save the general registers used below (restored in reverse order at .end)
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 156 ; local stack space
- femms ; enter 3dnow/mmx register state
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- ; the charge products are formed once here, from the first i entry only, and reused for every pair
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel] ; mm1=facel
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0] (O)
- movd mm3, [edx + ebx*4 + 4] ; mm3=charge[ii0+1] (H)
- movq mm4, mm2
- pfmul mm4, mm1 ; mm4=qO*facel
- movq mm6, mm3
- pfmul mm6, mm1 ; mm6=qH*facel
- movq mm5, mm4
- pfmul mm4, mm2 ; mm4=qqOO*facel
- pfmul mm5, mm3 ; mm5=qqOH*facel
- pfmul mm6, mm3 ; mm6=qqHH*facel
- punpckldq mm5,mm5 ; spread to both halves
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .qqOO], mm4
- movq [esp + .qqOH], mm5
- movq [esp + .qqHH], mm6
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 8-byte store: upper half lands on .ixH, which is rewritten below
-
- movd mm3, [eax + ebx*4 + 12] ; H1 x,y,z into the low halves of mm3-mm5
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24] ; H2 x,y,z into the high halves
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3 ; add the shift vector to both H coordinates
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fixO], mm7 ; clears fixO and fiyO
- movq [esp + .fizO], mm7 ; 8-byte store: also zeroes the low half of .fixH, cleared again on the next line
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi = base of pos[] for the inner loop
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the inner loop
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jindex[n] into jjnr[]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; one j molecule per iteration, handled as three sections: j O (offset 0), j H1 (+12), j H2 (+24).
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
-
- movd mm6, [esp + .qqOO] ; mm6 = charge product for iO - jO
- movq mm7, [esp + .qqOH] ; mm7 = charge product for iH1,iH2 - jO
-
- lea eax, [eax + eax*2] ; eax = 3*jnr
- movq mm0, [esi + eax*4] ; mm0 = jx,jy of j O
- movd mm1, [esi + eax*4 + 8] ; mm1 = jz of j O
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2 ; mm2 = jx,jx
- punpckhdq mm3,mm3 ; mm3 = jy,jy
- punpckldq mm4,mm4 ; mm4 = jz,jz
-
- pfsubr mm0, [esp + .ixO] ; reversed subtract: d = i - j
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm0 ; mm0(l) = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2 ; low halves were already duplicated above
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0 ; inverse square root seed for the O distance
-
- movq mm2,mm1 ; refine the seed (pfrsqit1/pfrcpit2 iteration)
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3 ; seed from the low half (H1)
- pswapd mm3,mm3
- pfrsqrt mm2, mm3 ; seed from the former high half (H2)
- pswapd mm3,mm3 ; swap back
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6 ; add the O potential to the packed H potentials
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0 ; the j site gets the opposite of the summed i force
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ; interactions with j H1.
- movq mm0, [esi + eax*4 + 12] ; mm0 = jx,jy of j H1
- movd mm1, [esi + eax*4 + 20] ; mm1 = jz of j H1
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- movd mm6, [esp + .qqOH] ; mm6 = charge product for iO - jH1
- movq mm7, [esp + .qqHH] ; mm7 = charge product for iH1,iH2 - jH1
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1 ; mm0(l) = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3
- pfrsqrt mm2, mm3
- pswapd mm3,mm3
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 12] ; force slot of j H1
- movd mm3, [edi + eax*4 + 20]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 12], mm2
- movd [edi + eax*4 + 20], mm3
-
- ; interactions with j H2
- movq mm0, [esi + eax*4 + 24] ; mm0 = jx,jy of j H2
- movd mm1, [esi + eax*4 + 32] ; mm1 = jz of j H2
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- movd mm6, [esp + .qqOH] ; mm6 = charge product for iO - jH2
- movq mm7, [esp + .qqHH] ; mm7 = charge product for iH1,iH2 - jH2
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1 ; mm0(l) = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3
- pfrsqrt mm2, mm3
- pswapd mm3,mm3
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 24] ; force slot of j H2
- movd mm3, [edi + eax*4 + 32]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 24], mm2
- movd [edi + eax*4 + 32], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk] ; counter is tested after the body, so the body always runs at least once
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO] ; only the low half is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3 ; bring fzH2 into the low half
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO] ; fshift += force on O + H1 + H2
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- dec dword [ebp + %$nri] ; the nri argument slot itself is the loop counter
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms ; leave 3dnow/mmx register state
- add esp, 156 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl1100_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ix equ 8
-.iy equ 12
-.iz equ 16
-.iq equ 20 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 28 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 36 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 44 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 52 ; repeated (64bit) to fill 3dnow reg
-.six equ 60 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 68 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 76
-.innerjjnr equ 80
-.innerk equ 84
-.fix equ 88
-.fiy equ 92
-.fiz equ 96
-.dx1 equ 100
-.dy1 equ 104
-.dz1 equ 108
-.dx2 equ 112
-.dy2 equ 116
-.dz2 equ 120
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 124 ; local stack space
- femms
- ; move data to local stack
- movq mm0, [mm_six]
- movq mm1, [mm_twelve]
- movq [esp + .six], mm0
- movq [esp + .twelve], mm1
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm3, mm1 ; mm3 has vcoul for both interactions
- movq mm7, mm3 ; use mm7 for sum to make fscal
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm7,mm4
- pfadd mm7, mm5
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; update vctot
- pfadd mm3, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm3 ; store the sum
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- movq mm1, mm0
- pfmul mm0, mm0 ; mm0=invsq
- ;; calculate potentials and scalar force
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm3, mm1 ; mm3 has vcoul for both interactions
- movq mm7, mm3 ; use mm7 for sum to make fscal
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm7,mm4
- pfadd mm7, mm5
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- ;; update vctot
- pfadd mm3, [esp + .vctot]
- movq [esp + .vctot], mm3
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 124
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-;; inl1110_3dnow
-;;
-;; 3DNow! inner loop accumulating a coulomb potential (vctot -> Vc[gid]) and a
-;; c6/c12 potential (vnbtot -> Vnb[gid]) together with the matching forces.
-;;
-;; For each of the nri outer iterations the routine reads one entry of shift[],
-;; iinr[], jindex[], gid[] and three counters from nsatoms[], then walks the
-;; neighbour list jjnr[jindex[n]] .. jjnr[jindex[n+1]] once per i atom, in
-;; three consecutive phases (i atoms are numbered consecutively from iinr[n]):
-;;   .mno_vdwc : nsatoms[1] atoms,              c6/c12 + coulomb
-;;   .mno_coul : nsatoms[2]-nsatoms[1] atoms,   coulomb only
-;;   .mno_vdw  : nsatoms[0]-nsatoms[2] atoms,   c6/c12 only
-;; Each phase processes the j list two particles at a time (one per half of an
-;; mmx register) and finishes an odd list with a single-particle step.
-;;
-;; Side effects: faction[] is updated for both i and j atoms, fshift[] for the
-;; i atoms, Vc[gid] and Vnb[gid] once per outer iteration. The pointer/count
-;; arguments on the stack (shift, iinr, nsatoms, jindex, gid, nri) are advanced
-;; in place. nri must be >= 1: the loop body runs before the count is tested.
-;; All general purpose registers that are used are saved and restored.
-proc inl1110_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$nsatoms arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.iq equ 32 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 40 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 48 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 56 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 64 ; repeated (64bit) to fill 3dnow reg
-.six equ 72 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 80 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 88
-.innerjjnr0 equ 92 ; start of the j list, reused by every i atom of this outer iteration
-.innerk0 equ 96 ; length of the j list, reused likewise
-.innerjjnr equ 100
-.innerk equ 104
-.fix equ 108
-.fiy equ 112
-.fiz equ 116
-.dx1 equ 120
-.dy1 equ 124
-.dz1 equ 128
-.dx2 equ 132
-.dy2 equ 136
-.dz2 equ 140
-.nsvdwc equ 144
-.nscoul equ 148
-.nsvdw equ 152
-.solnr equ 156
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 160 ; local stack space
- femms
- movq mm0, [mm_six]
- movq mm1, [mm_twelve]
- movq [esp + .six], mm0
- movq [esp + .twelve], mm1
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12 ; three counters are consumed per outer iteration
- mov ecx, [eax] ; ecx = nsatoms[0]
- mov edx, [eax + 4] ; edx = nsatoms[1]
- mov eax, [eax + 8] ; eax = nsatoms[2]
- sub ecx, eax ; ecx = nsatoms[0] - nsatoms[2]
- sub eax, edx ; eax = nsatoms[2] - nsatoms[1]
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- mov [esp + .solnr], ebx ; first i atom; incremented once per atom in each phase
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
-
- ;; phase 1: atoms with both c6/c12 and coulomb interactions
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; 64-bit operand: only the low half is kept by the punpckldq below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos] ; esi is pos[] again
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm3, mm1 ; mm3 has vcoul for both interactions
- movq mm7, mm3 ; use mm7 for sum to make fscal
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm7,mm4
- pfadd mm7, mm5 ; mm7 = vcoul - 6*vnb6 + 12*vnb12
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; update vctot
- pfadd mm3, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm3 ; store the sum
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- and [esp + .innerk], dword 1 ; odd number of j particles -> one left over
- jnz .single_vdwc_inner
- jmp .updateouterdata_vdwc
-.single_vdwc_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; low mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- movq mm1, mm0 ; mm1=invsqrt
- pfmul mm0, mm0 ; mm0=invsq
- ;; calculate potentials and scalar force
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm3, mm1 ; mm3 has vcoul
- movq mm7, mm3 ; use mm7 for sum to make fscal
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm7,mm4
- pfadd mm7, mm5
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- ;; update vctot
- pfadd mm3, [esp + .vctot]
- movq [esp + .vctot], mm3
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
- ;; phase 2: atoms with coulomb interactions only
-.testcoul:
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; 64-bit operand: only the low half is kept by the punpckldq below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-.unroll_coul_loop:
- ;; paired innerloop here.
- ;; esi is not reloaded in this phase: it still holds pos[], as set in .outer
- ;; and restored by every path through the vdwc phase.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- movd mm7, [ecx + ebx*4] ; charge[jnr2]
- punpckldq mm3,mm7 ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- pfmul mm3,mm1 ; mm3 has both vcoul
- pfmul mm0,mm3 ; mm0 has both fscal
-
- ;; update vctot
-
- pfadd mm3, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm3 ; store the sum
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- and [esp + .innerk], dword 1 ; odd number of j particles -> one left over
- jnz .single_coul_inner
- jmp .updateouterdata_coul
-.single_coul_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm6, [esp + .iq]
- movd mm7, [ecx + eax*4]
- pfmul mm6, mm7 ; mm6=qq
-
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm2, [esi + eax*4]
- movd mm3, [esi + eax*4 + 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [esp + .dx1], mm0
- pfmul mm0,mm0
- movd [esp + .dz1], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfacc mm0, mm1 ; mm0=rsq
-
- pfrsqrt mm1,mm0
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
- ;; update vctot
- movq mm5, [esp + .vctot]
- pfadd mm5, mm6
- movq [esp + .vctot], mm5
- ;; spread fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dx1]
- movd mm1, [esp + .dz1]
- pfmul mm0, mm4
- pfmul mm1, mm4
- ;; update i particle force
- movq mm2, [esp + .fix]
- movd mm3, [esp + .fiz]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fix], mm2
- movd [esp + .fiz], mm3
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax *4+ 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
- ;; done!
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
- ;; phase 3: atoms with c6/c12 interactions only
-.testvdw:
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos] ; esi is pos[] again
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- movq mm1,mm0
- pfmul mm0,mm0
- ;; mm0 now contains invsq, and mm1 invsqrt
- ;; do potential and fscal
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- movq mm7, mm5
- pfsub mm7,mm4 ; mm7 = 12*vnb12 - 6*vnb6
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- and [esp + .innerk], dword 1 ; odd number of j particles -> one left over
- jnz .single_vdw_inner
- jmp .updateouterdata_vdw
-.single_vdw_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; low mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- movq mm1, mm0 ; mm1=invsqrt
- pfmul mm0, mm0 ; mm0=invsq
- ;; calculate potentials and scalar force
- movq mm4, mm0
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- movq mm7, mm5
- pfsub mm7,mm4
- pfmul mm0, mm7 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
- ;; all three phases done for this outer iteration: flush the potentials.
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 160 ; release the local stack space reserved in the prologue
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl1120_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.iqO equ 44 ; repeated (64bit) to fill 3dnow reg
-.iqH equ 52 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 60 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 68 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 76 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 84 ; repeated (64bit) to fill 3dnow reg
-.six equ 92 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 100 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 108 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 116
-.innerk equ 120
-.fixO equ 124
-.fiyO equ 128
-.fizO equ 132
-.fixH equ 136 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 144 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 152 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 160
-.dyO equ 164
-.dzO equ 168
-.dxH equ 172 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 180 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 188 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 196 ; local stack space
- femms
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0]
- pfmul mm2, mm1
- movq [esp + .iqO], mm2 ; iqO = facel*charge[ii]
-
- movd mm2, [edx + ebx*4 + 4] ; mm2=charge[ii0+1]
- pfmul mm2, mm1
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iqH], mm2 ; iqH = facel*charge[ii0+1]
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-
- movq mm3, [mm_six]
- movq mm4, [mm_twelve]
- movq [esp + .six], mm3
- movq [esp + .twelve], mm4
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movd [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
- ;prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge]
- movd mm7, [ecx + eax*4]
- punpckldq mm7,mm7
- movq mm6,mm7
- pfmul mm6, [esp + .iqO]
- pfmul mm7, [esp + .iqH] ; mm6=qqO, mm7=qqH
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr]
- mov ecx, [ebp + %$nbfp]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [ecx + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [ecx + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- lea eax, [eax + eax*2]
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
-
- movq mm0, mm4
- pfmul mm0, mm4
- pfmul mm0, mm4 ; mm0=rinvsix
- movq mm2, mm0
- pfmul mm2, mm2 ; mm2=rintwelve
-
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- movq mm1, mm6 ; use mm1 for fscal sum
-
- ;; LJ for the oxygen
- pfmul mm0, [esp + .c6]
- pfmul mm2, [esp + .c12]
-
- ;; calc nb potential
- movq mm5, mm2
- pfsub mm5, mm0
-
- ;; calc nb force
- pfmul mm0, [esp + .six]
- pfmul mm2, [esp + .twelve]
-
- ;; increment scalar force
- pfsub mm1, mm0
- pfadd mm1, mm2
- pfmul mm4, mm1 ; total scalar force on oxygen.
-
- ;; update nb potential
- pfadd mm5, [esp + .vnbtot]
- movq [esp + .vnbtot], mm5
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3
- pfrsqrt mm2, mm3
- pswapd mm3,mm3
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; same for Vnb.
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
- ;; finish if last
- dec dword [ebp + %$nri]
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 196
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl1130_3dnow ; 3-atom vs 3-atom inner loop: Coulomb for all nine atom pairs, Lennard-Jones only for the first-atom (O-O) pair
-%$nri arg ; outer-iteration counter, decremented in place until zero
-%$iinr arg ; pointer into iinr[], advanced by 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; entries n and n+1 delimit the j list
-%$jjnr arg ; base of jjnr[] (j atom indices)
-%$shift arg ; pointer into shift[], advanced by 4 bytes per outer iteration
-%$shiftvec arg ; base of shiftvec[] (indexed as 3 floats per entry)
-%$fshift arg ; base of fshift[] (indexed as 3 floats per entry)
-%$gid arg ; pointer into gid[], advanced by 4 bytes per outer iteration
-%$pos arg ; base of pos[] (indexed as 3 floats per atom)
-%$faction arg ; base of faction[] (indexed as 3 floats per atom)
-%$charge arg ; base of charge[]
-%$facel arg ; float passed by value, multiplied into the charge products
-%$Vc arg ; base of Vc[], indexed by the group index read from gid
-%$type arg ; base of type[]
-%$ntype arg ; integer passed by value, used to index nbfp[]
-%$nbfp arg ; base of nbfp[]; two consecutive floats are read as c6 and c12
-%$Vnb arg ; base of Vnb[], indexed by the group index read from gid
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12 ; accessed as the high half of the 64-bit .ixO slot
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.qqOO equ 44 ; repeated (64bit) to fill 3dnow reg
-.qqOH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqHH equ 60 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 68 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 76 ; repeated (64bit) to fill 3dnow reg
-.six equ 84 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 92 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 100 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 108 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 116
-.innerk equ 120
-.fixO equ 124
-.fiyO equ 128 ; accessed as the high half of the 64-bit .fixO slot
-.fizO equ 132
-.fixH equ 136 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 144 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 152 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 160
-.dyO equ 164 ; accessed as the high half of the 64-bit .dxO slot
-.dzO equ 168
-.dxH equ 172 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 180 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 188 ; repeated (64bit) to fill 3dnow reg
- push eax ; save the registers used below; restored in reverse order at .end
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 196 ; local stack space
- femms ; fast clear of the MMX state before using the 3DNow registers
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge] ; edx = base of charge[]
- movd mm1, [ebp + %$facel] ; mm1=facel
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0] (O)
- movd mm3, [edx + ebx*4 + 4] ; mm3=charge[ii0+1] (H)
- movq mm4, mm2
- pfmul mm4, mm1 ; mm4=qO*facel
- movq mm6, mm3
- pfmul mm6, mm1 ; mm6=qH*facel
- movq mm5, mm4
- pfmul mm4, mm2 ; mm4=qqOO*facel
- pfmul mm5, mm3 ; mm5=qqOH*facel
- pfmul mm6, mm3 ; mm6=qqHH*facel
- punpckldq mm5,mm5 ; spread to both halves
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .qqOO], mm4 ; only the low half of qqOO is used later (read with movd)
- movq [esp + .qqOH], mm5
- movq [esp + .qqHH], mm6
- mov edx, [ebp + %$type] ; edx = base of type[]
- mov ecx, [edx + ebx*4] ; ecx = type[ii]
- shl ecx, 1 ; ecx = 2*type[ii]
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = 2*type[ii]*ntype
- add edx, ecx ; edx = 2*type[ii]*(ntype+1)
- mov eax, [ebp + %$nbfp] ; eax = base of nbfp[]
- movd mm0, [eax + edx*4] ; mm0 = nbfp[edx], kept as c6
- movd mm1, [eax + edx*4 + 4] ; mm1 = nbfp[edx+1], kept as c12
- movq [esp + .c6], mm0
- movq [esp + .c12], mm1
- movq mm2, [mm_six] ; mm_six / mm_twelve are constants defined outside this procedure
- movq mm3, [mm_twelve]
- movq [esp + .six], mm2
- movq [esp + .twelve], mm3
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 8-byte store: the high half lands in .ixH, which is rewritten below
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3 ; add the shift vector to both H positions
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movq [esp + .fizO], mm7 ; 8-byte store: also clears the low half of .fixH (zeroed next anyway)
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi = base of pos[] for the inner loop
- mov edi, [ebp + %$faction] ; edi = base of faction[] for the inner loop
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
-
- movd mm6, [esp + .qqOO] ; mm6 = qqOO in the low half (iO with the first j atom)
- movq mm7, [esp + .qqOH] ; mm7 = qqOH in both halves (iH1/iH2 with the first j atom)
-
- lea eax, [eax + eax*2] ; eax = 3*jnr
- movq mm0, [esi + eax*4] ; mm0 = jx,jy of the first j atom
- movd mm1, [esi + eax*4 + 8] ; mm1 = jz of the first j atom
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO] ; mm0 = (ixO-jx, iyO-jy)
- pfsubr mm1, [esp + .izO] ; low half of mm1 = izO-jz
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm0 ; both halves = dx*dx+dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0 ; seed for 1/sqrt(rsqO), refined by the iteration below
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt OO
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq OO
-
- movq mm2, mm4
- pfmul mm2, mm4
- pfmul mm2, mm4 ; mm2=rinvsix
- movq mm0, mm2
- pfmul mm0,mm0 ; mm0=rintwelve
- pfmul mm2, [esp + .c6] ; mm2 = c6*rinvsix
- pfmul mm0, [esp + .c12] ; mm0 = c12*rintwelve
- movq mm5, mm0
- pfsub mm5, mm2 ; vnb = c12*rintwelve - c6*rinvsix
-
- pfmul mm2, [esp + .six] ; six*c6*rinvsix
- pfmul mm0, [esp + .twelve] ; twelve*c12*rintwelve
-
- pfsub mm0, mm2 ; mm0 = twelve*c12*rintwelve - six*c6*rinvsix
-
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfadd mm0, mm6 ; add vcoul to the Lennard-Jones force term
- pfmul mm4, mm0 ; mm4=fscalar
-
- ;; update nb potential
- pfadd mm5, [esp + .vnbtot]
- movq [esp + .vnbtot], mm5
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3 ; swap halves to reach the second rsq
- pfrsqrt mm2, mm3
- pswapd mm3,mm3 ; swap back
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6 ; add the O term (low half of mm6) to the H terms
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ; interactions with j H1.
- movq mm0, [esi + eax*4 + 12] ; j H1 coordinates start 12 bytes after the first j atom
- movd mm1, [esi + eax*4 + 20]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- movd mm6, [esp + .qqOH] ; mm6 = qqOH in the low half (iO with j H1)
- movq mm7, [esp + .qqHH] ; mm7 = qqHH in both halves (iH1/iH2 with j H1)
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3
- pfrsqrt mm2, mm3
- pswapd mm3,mm3
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 12]
- movd mm3, [edi + eax*4 + 20]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 12], mm2
- movd [edi + eax*4 + 20], mm3
-
- ; interactions with j H2
- movq mm0, [esi + eax*4 + 24] ; j H2 coordinates start 24 bytes after the first j atom
- movd mm1, [esi + eax*4 + 32]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- movd mm6, [esp + .qqOH] ; mm6 = qqOH in the low half (iO with j H2)
- movq mm7, [esp + .qqHH] ; mm7 = qqHH in both halves (iH1/iH2 with j H2)
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- movq mm4, mm1
- pfmul mm4, mm4 ; mm4=invsq
- ;; calculate potential and scalar force
- pfmul mm6, mm1 ; mm6=vcoul
- pfmul mm4, mm6 ; mm4=fscalar
-
- pfrsqrt mm5, mm3
- pswapd mm3,mm3
- pfrsqrt mm2, mm3
- pswapd mm3,mm3
- punpckldq mm5,mm2 ; seeds are in mm5 now, and rsq in mm3.
-
- movq mm2, mm5
- pfmul mm5,mm5
- pfrsqit1 mm5,mm3
- pfrcpit2 mm5,mm2 ; mm5=invsqrt
- movq mm3,mm5
- pfmul mm3,mm3 ; mm3=invsq
- pfmul mm7, mm5 ; mm7=vcoul
- pfmul mm3, mm7 ; mm3=fscal for the two H's.
-
- ;; update vctot
- pfadd mm7, mm6
- pfadd mm7, [esp + .vctot]
- movq [esp + .vctot], mm7
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm4,mm4
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm4
- pfmul mm1, mm4
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm3
- pfmul mm6, mm3
- pfmul mm7, mm3
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 24]
- movd mm3, [edi + eax*4 + 32]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 24], mm2
- movd [edi + eax*4 + 32], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO] ; only the low half of the sum is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3 ; bring fzH2 into the low half of mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO] ; add the O, H1 and H2 forces in turn
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment Vnb[gid]
- ;; finish if last
- dec dword [ebp + %$nri]
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms ; clear the MMX state again before returning
- add esp, 196 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3000_3dnow ; table-lookup Coulomb inner loop (no Lennard-Jones); j atoms are processed two at a time, plus one single iteration for an odd count
-%$nri arg ; outer-iteration counter, decremented and written back until zero
-%$iinr arg ; pointer into iinr[], advanced by 4 bytes per outer iteration
-%$jindex arg ; pointer into jindex[]; entries n and n+1 delimit the j list
-%$jjnr arg ; base of jjnr[] (j atom indices)
-%$shift arg ; pointer into shift[], advanced by 4 bytes per outer iteration
-%$shiftvec arg ; base of shiftvec[] (indexed as 3 floats per entry)
-%$fshift arg ; base of fshift[] (indexed as 3 floats per entry)
-%$gid arg ; pointer into gid[], advanced by 4 bytes per outer iteration
-%$pos arg ; base of pos[] (indexed as 3 floats per atom)
-%$faction arg ; base of faction[] (indexed as 3 floats per atom)
-%$charge arg ; base of charge[]
-%$facel arg ; float passed by value, multiplied into the i charge
-%$Vc arg ; base of Vc[], indexed by the group index read from gid
-%$tabscale arg ; float passed by value; r*tabscale gives the table coordinate
-%$VFtab arg ; base of the table, read as 4 floats per table point
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ix equ 8
-.iy equ 12 ; accessed as the high half of the 64-bit .ix slot
-.iz equ 16
-.iq equ 20 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 28 ; repeated (64bit) to fill 3dnow reg
-.two equ 36 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 44 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 52 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 60 ; not referenced anywhere in this procedure
-.innerjjnr equ 64
-.innerk equ 68
-.fix equ 72
-.fiy equ 76 ; accessed as the high half of the 64-bit .fix slot
-.fiz equ 80
-.dx1 equ 84
-.dy1 equ 88 ; accessed as the high half of the 64-bit .dx1 slot
-.dz1 equ 92
-.dx2 equ 96
-.dy2 equ 100 ; accessed as the high half of the 64-bit .dx2 slot
-.dz2 equ 104
- push eax ; save the registers used below; restored in reverse order at .end
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 108 ; local stack space
- femms ; fast clear of the MMX state before using the 3DNow registers
- ; move data to local stack
- movq mm0, [mm_two] ; mm_two is a constant defined outside this procedure
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- punpckldq mm3,mm3 ; tabscale in both halves
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; 64-bit operand: only the low half (charge*facel) is kept by the spread below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi = base of pos[]
- mov edi, [ebp + %$faction] ; edi = base of faction[]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2 ; sets the flags tested by jge below (the mov leaves them untouched)
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop ; at least two j atoms: take the paired loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos] ; reload of the same pos[] base already placed in esi before the loop
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1 ; mm0 = (seed1, seed2)
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0 ; mm4 = rsq*invsqrt = r
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1 ; float -> integer table index for both pairs; NOTE(review): pf2iw produces a 16-bit result, confirm the table never needs a larger index
- movq [esp + .n1], mm4 ; keep both integer indices for the address computations below
- pi2fd mm4,mm4 ; back to float so it can be subtracted from rt
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1] ; table index of the first pair
- shl ecx, 2 ; 4 floats per table point (combined with the *4 scale below)
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4] ; table value 0: added to eps*Fp to form VV
- movd mm5, [edx + ecx*4 + 4] ; table value 1: base term of Fp
- movd mm6, [edx + ecx*4 + 8] ; table value 2: G
- movd mm7, [edx + ecx*4 + 12] ; table value 3: H
- mov ecx, [esp + .n1 + 4] ; table index of the second pair
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4] ; second pair's values go to the high halves
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3 ; mm1 = -fijC
- pfmul mm1, [esp + .tabscale] ; mm1 = -fijC*tabscale
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the first pair in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the second pair in both halves
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner ; fewer than two j atoms left
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1 ; the counter is below zero here; its low bit is set when one j atom is left over
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2] ; eax = 3*jnr
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0 ; dr = ir - jr
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq (low half)
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- pfmul mm4, mm0 ; mm4 = rsq*invsqrt = r
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1 ; float -> integer table index
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; 4 floats per table point (combined with the *4 scale below)
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3 ; mm1 = -fijC
- pfmul mm0, [esp + .tabscale] ; mm0 = invsqrt*tabscale
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz] ; only the low half of the sum is stored below
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end ; jump when the decremented counter reached zero
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx ; write the decremented counter back
- jmp .outer
-.end:
- femms ; clear the MMX state again before returning
- add esp, 108 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-;; inl3010_3dnow: 3DNow! inner loop evaluating a tabulated Coulomb
-;; interaction between i particles and their j neighbour lists.
-;; For each of the nri outer iterations the routine reads one count from
-;; the nsatoms array (.nscoul) and repeats the complete i-particle work
-;; for that many consecutive atom indices, starting at iinr[n] (.solnr).
-;; j particles are processed two at a time, with a single-particle tail.
-;; Forces are accumulated into faction[] and fshift[], and the summed
-;; potential into Vc[gid[n]].
-;; Arguments are read relative to ebp through the %$name context-local
-;; symbols set up by the proc/arg macros (defined outside this block).
-;; All general registers that are used are saved and restored here.
-proc inl3010_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets (in bytes, relative to esp) for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.iq equ 32 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 40 ; repeated (64bit) to fill 3dnow reg
-.two equ 48 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 56 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 64 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr0 equ 72
-.innerk0 equ 76
-.innerjjnr equ 80
-.innerk equ 84
-.fix equ 88
-.fiy equ 92
-.fiz equ 96
-.dx1 equ 100
-.dy1 equ 104
-.dz1 equ 108
-.dx2 equ 112
-.dy2 equ 116
-.dz2 equ 120
-.nscoul equ 124
-.solnr equ 128
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 132 ; local stack space
- femms
-
- ;; skip the first two dwords of nsatoms[]; the outer loop below reads
- ;; one dword and then advances by 12 bytes, i.e. every third entry.
- add [ebp + %$nsatoms], dword 8
- movq mm2, [mm_two]
- movq [esp + .two], mm2
- movd mm3, [ebp + %$tabscale]
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- ;; number of consecutive i atoms to process in this outer iteration
- mov eax, [ebp + %$nsatoms]
- mov ecx, [eax]
- add [ebp + %$nsatoms], dword 12
- mov [esp + .nscoul], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- mov [esp + .solnr], ebx ; first atom index of this group
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- ;; a zero atom count skips straight to the potential update
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .last_mno
-.mno_coul:
- ;; one pass per atom of the group: the same j list (.innerjjnr0/.innerk0)
- ;; is traversed again for every atom index taken from .solnr.
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- ;; 64-bit memory operand: only the low product is kept, because the
- ;; punpckldq below overwrites the high half with the low half.
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- ;; the jge tests the flags of the sub above (mov leaves flags untouched)
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-.unroll_coul_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1 ; integer table index for each pair
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; four floats per table point (16 bytes with the *4 below)
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; table index of the second pair
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch the stored dr vectors (.dx1) to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; .innerk is negative here (-2 or -1); bit 0 set means one j particle is left.
- and [esp + .innerk], dword 1
- jnz .single_coul_inner
- jmp .updateouterdata_coul
-.single_coul_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- ;; same product as in the unrolled loop, with tabscale applied to mm0 instead of mm1
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_coul:
- ;; add the accumulated i force of this atom to faction[] and fshift[]
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .last_mno
- jmp .mno_coul
-.last_mno:
- ;; all atoms of the group are done - flush the potential for this outer iteration
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 132 ; release the local stack space reserved on entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-;; inl3020_3dnow: 3DNow! inner loop evaluating a tabulated Coulomb
-;; interaction between three-site i molecules and single j atoms.
-;; The three i sites are labelled O, H1 and H2 in the local names; their
-;; coordinates and forces are taken from three consecutive xyz triplets
-;; starting at index 3*ii. The O site is handled in the low half of the
-;; registers, while H1 and H2 share one register (low/high half).
-;; Forces are accumulated into faction[] and fshift[], and the summed
-;; potential into Vc[gid[n]].
-;; Arguments are read relative to ebp through the %$name context-local
-;; symbols set up by the proc/arg macros (defined outside this block).
-;; All general registers that are used are saved and restored here.
-proc inl3020_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets (in bytes, relative to esp) for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.iqO equ 44 ; repeated (64bit) to fill 3dnow reg
-.iqH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqO equ 60 ; repeated (64bit) to fill 3dnow reg
-.qqH equ 68 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 76 ; repeated (64bit) to fill 3dnow reg
-.two equ 84 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 92 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 100 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 108
-.innerk equ 112
-.fixO equ 116
-.fiyO equ 120
-.fizO equ 124
-.fixH equ 128 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 136 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 144 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 152
-.dyO equ 156
-.dzO equ 160
-.dxH equ 164 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 172 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 180 ; repeated (64bit) to fill 3dnow reg
-.tmprsqH equ 188 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 196 ; local stack space
- femms
-
- ;; The i charges are read only once, from the first iinr entry, and
- ;; reused for every outer iteration; the second charge is used for
- ;; both H sites.
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0]
- pfmul mm2, mm1
- movq [esp + .iqO], mm2 ; iqO = facel*charge[ii]
-
- movd mm2, [edx + ebx*4 + 4] ; mm2=charge[ii0+1]
- pfmul mm2, mm1
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iqH], mm2 ; iqH = facel*charge[ii0+1]
-
- movq mm3, [mm_two]
- movd mm4, [ebp + %$tabscale]
- punpckldq mm4,mm4 ; spread to both halves
- movq [esp + .two], mm3
- movq [esp + .tabscale], mm4
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- ;; 8-byte store: the upper dword lands in .ixH, which is written below
- movq [esp + .izO], mm6
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fixO], mm7
- movd [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle iteration.
- ;; (the counter is tested only after the body, so at least one j
- ;; particle per i molecule is assumed)
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
- ;prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge]
- movd mm7, [ecx + eax*4]
- punpckldq mm7,mm7
- movq mm6,mm7
- pfmul mm6, [esp + .iqO]
- pfmul mm7, [esp + .iqH] ; mm6=qqO, mm7=qqH
- movd [esp + .qqO], mm6
- movq [esp + .qqH], mm7
-
- lea eax, [eax + eax*2]
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- ;; mm2-mm4 already hold the same value in both halves from the
- ;; expansion above, so these three leave them unchanged.
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- ;; inverse square root for the O site: table seed + one N-R iteration
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0, mm1 ; mm0=r
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; four floats per table point (16 bytes with the *4 below)
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqO] ; fijC=qq*FF
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- movq mm3, mm7
-
- ; change sign of fscal and multiply with rinv
- pxor mm0,mm0
- pfsubr mm3, mm0 ; reversed subtract: mm3 = 0 - mm3
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; now do the two hydrogens.
- movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-
- ;; pfrsqrt only uses the low half of its source, so the halves are
- ;; swapped to seed H2 and then swapped back.
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; table index for H2
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqH] ; fijC=qq*FF
-
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 + 8], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- ;; unpack the H forces, stored as (H1,H2) pairs per component,
- ;; into per-atom xy/z form
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3, mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- ;; the shift force receives the sum of the O, H1 and H2 forces
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- ;; finish if last
- dec dword [ebp + %$nri] ; the nri argument slot itself is the loop counter
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 196 ; release the local stack space reserved on entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3030_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.qqOO equ 44 ; repeated (64bit) to fill 3dnow reg
-.qqOH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqHH equ 60 ; repeated (64bit) to fill 3dnow reg
-.two equ 68 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 76 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 84 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 92 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 100
-.innerk equ 104
-.fixO equ 108
-.fiyO equ 112
-.fizO equ 116
-.fixH equ 120 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 128 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 136 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 144
-.dyO equ 148
-.dzO equ 152
-.dxH equ 156 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 164 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 172 ; repeated (64bit) to fill 3dnow reg
-.tmprsqH equ 180 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 188 ; local stack space
- femms
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel] ; mm1=facel
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0] (O)
- movd mm3, [edx + ebx*4 + 4] ; mm2=charge[ii0+1] (H)
- movq mm4, mm2
- pfmul mm4, mm1
- movq mm6, mm3
- pfmul mm6, mm1
- movq mm5, mm4
- pfmul mm4, mm2 ; mm4=qqOO*facel
- pfmul mm5, mm3 ; mm5=qqOH*facel
- pfmul mm6, mm3 ; mm6=qqHH*facel
- punpckldq mm5,mm5 ; spread to both halves
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .qqOO], mm4
- movq [esp + .qqOH], mm5
- movq [esp + .qqHH], mm6
- movq mm2, [mm_two]
- movq [esp + .two], mm2
- movd mm3, [ebp + %$tabscale]
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .fixO], mm7
- movq [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
-
- lea eax, [eax + eax*2]
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm0
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt OO
- pfmul mm0, mm1 ; mm0=rsq OO
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOO] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- movq mm3, mm7
-
- ; change sign of fscal and multiply with rinv
- pxor mm0,mm0
- pfsubr mm3, mm0
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; time for hydrogens!
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ; interactions with j H1.
-
- movq mm0, [esi + eax*4 + 12]
- movd mm1, [esi + eax*4 + 20]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1 ; mm0=rsq
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, force is moved to mm3.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3, mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 12]
- movd mm3, [edi + eax*4 + 20]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 12], mm2
- movd [edi + eax*4 + 20], mm3
-
- ; interactions with j H2
- movq mm0, [esi + eax*4 + 24]
- movd mm1, [esi + eax*4 + 32]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3,mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 24]
- movd mm3, [edi + eax*4 + 32]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 24], mm2
- movd [edi + eax*4 + 32], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- ;; finish if last
- dec dword [ebp + %$nri]
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 188
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3100_3dnow ; 3DNow! inner loop: tabulated Coulomb + Lennard-Jones (c6/c12) forces and energies, two j particles per iteration
-%$nri arg ; number of i particles (outer iterations); counted down in place
-%$iinr arg ; pointer into the i particle index list; advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into neighbour list bounds: [n] = first j entry, [n+1] = one past the last
-%$jjnr arg ; base of the j particle index list
-%$shift arg ; pointer into the shift index list; advanced 4 bytes per outer iteration
-%$shiftvec arg ; base of the shift vectors (3 floats each)
-%$fshift arg ; base of the shift forces (3 floats each); incremented by the total i force
-%$gid arg ; pointer into the group index list; advanced 4 bytes per outer iteration
-%$pos arg ; base of the coordinates (3 floats per particle)
-%$faction arg ; base of the forces (3 floats per particle); updated in place
-%$charge arg ; base of the per-particle charges
-%$facel arg ; scalar multiplied into charge[ii] to form iq
-%$Vc arg ; coulomb energy accumulators, indexed by gid
-%$type arg ; base of the per-particle type indices
-%$ntype arg ; multiplied with type[ii] to select the row in nbfp
-%$nbfp arg ; c6/c12 pairs, indexed by 2*(ntype*type[ii] + type[jnr])
-%$Vnb arg ; Lennard-Jones energy accumulators, indexed by gid
-%$tabscale arg ; scale from r to table coordinate (rt = r*tabscale)
-%$VFtab arg ; coulomb table, 4 floats per table point
- ;; stack offsets for local variables
-.is3 equ 0 ; 3*shift index
-.ii3 equ 4 ; 3*ii
-.ix equ 8 ; shifted i position; x/y are accessed as one 64-bit pair
-.iy equ 12
-.iz equ 16
-.iq equ 20 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 28 ; two partial coulomb sums (one per lane), added together in .updateouterdata
-.vnbtot equ 36 ; two partial LJ sums (one per lane), added together in .updateouterdata
-.c6 equ 44 ; c6 of 1st j / c6 of 2nd j
-.c12 equ 52 ; c12 of 1st j / c12 of 2nd j
-.six equ 60 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 68 ; repeated (64bit) to fill 3dnow reg
-.two equ 76 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 84 ; two 32-bit integer table indices (one per j particle)
-.tabscale equ 92 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 100 ; 2*ntype*type[ii]
-.innerjjnr equ 104 ; pointer to the next unprocessed jjnr entry
-.innerk equ 108 ; remaining j count, biased by -2 while the unrolled loop runs
-.fix equ 112 ; accumulated force on the i particle; x/y are accessed as one 64-bit pair
-.fiy equ 116
-.fiz equ 120
-.dx1 equ 124 ; distance vector to the 1st j particle of a pair
-.dy1 equ 128
-.dz1 equ 132
-.dx2 equ 136 ; distance vector to the 2nd j particle of a pair
-.dy2 equ 140
-.dz2 equ 144
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 148 ; local stack space
- femms ; clear MMX state before using the mm registers
- ; move data to local stack
- movq mm0, [mm_two] ; mm_two/mm_six/mm_twelve are defined outside this procedure - presumably packed float constants
- movq mm1, [mm_six]
- movq mm2, [mm_twelve]
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- movq [esp + .six], mm1
- movq [esp + .twelve], mm2
- punpckldq mm3,mm3 ; copy tabscale to both halves
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel] ; 64-bit memory operand; only the low product is kept by the punpckldq below
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4] ; edx = type[ii]
- imul edx, [ebp + %$ntype]
- shl edx, 1 ; two floats (c6, c12) per nbfp entry
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2 ; bias the count by -2: non-negative means at least one full pair remains
- mov [esp + .innerk], edx ; store count-2; mov leaves the flags from the sub intact for the jge
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos] ; esi was reused for nbfp above - restore the pos base
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt for both pairs
- pfmul mm4, mm0 ; rsq*invsqrt = r
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1 ; integer part of rt (pf2iw converts via 16-bit integers)
- movq [esp + .n1], mm4 ; keep both integer indices for the table lookups below
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1] ; table index for the 1st j particle
- shl ecx, 2 ; 4 floats per table point
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; table index for the 2nd j particle
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- movq mm1, mm0
- pfmul mm1,mm1 ; mm1=invsq
- movq mm2, mm1
- pfmul mm2,mm1
- pfmul mm2,mm1 ; mm2=rinvsix
- movq mm1,mm2
- pfmul mm1,mm1 ; mm1=rinvtwelve
-
- pfmul mm3, [esp + .tabscale] ; mm3 = fijC*tabscale
-
- pfmul mm1, [esp + .c12] ; mm1 = vnb12
-
- pfmul mm2, [esp + .c6] ; mm2 = vnb6
-
- movq mm4, mm1
- pfsub mm4, mm2 ; mm4 = vnb12-vnb6
-
- pfmul mm2, [esp + .six]
- pfmul mm1, [esp + .twelve]
-
- pfsub mm1, mm2
- pfmul mm1, mm0 ; mm1= (12*vnb12-6*vnb6)*rinv
-
- pfsub mm1, mm3 ; subtract the tabulated coulomb term fijC*tabscale
-
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0 = fscal of the 1st j in both halves
- punpckhdq mm1,mm1 ; mm1 = fscal of the 2nd j in both halves
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm4, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm4 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2 ; j particles get the opposite force
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner ; negative: fewer than two j particles are left
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1 ; bit 0 of the biased count is set when one j particle is left over
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5 ; movd cleared the high half, so the upper slot is stored as zero
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2] ; replace jnr with j3
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm0=invsqrt
- pfmul mm4, mm0 ; rsq*invsqrt = r
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4 ; only the low index is stored and used here
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; 4 floats per table point
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- movq mm1, mm0
- pfmul mm1,mm1 ; mm1=invsq
- movq mm2, mm1
- pfmul mm2,mm1
- pfmul mm2,mm1 ; mm2=rinvsix
- movq mm1,mm2
- pfmul mm1,mm1 ; mm1=rinvtwelve
-
- pfmul mm3, [esp + .tabscale] ; mm3 = fijC*tabscale
-
- pfmul mm1, [esp + .c12] ; mm1 = vnb12
-
- pfmul mm2, [esp + .c6] ; mm2 = vnb6
-
- movq mm4, mm1
- pfsub mm4, mm2 ; mm4 = vnb12-vnb6
-
- pfmul mm2, [esp + .six]
- pfmul mm1, [esp + .twelve]
-
- pfsub mm1, mm2
- pfmul mm1, mm0 ; mm1= (12*vnb12-6*vnb6)*rinv
-
- pfsub mm1, mm3 ; subtract the tabulated coulomb term fijC*tabscale
-
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm4, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm4 ; store the sum
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end ; leave when the remaining i particle count reaches zero
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx ; write the decremented count back to the argument slot
- jmp .outer
-.end:
- femms ; clear MMX state again before returning
- add esp, 148 ; release the local stack space reserved at entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-
-
-proc inl3110_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.iq equ 32 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 40 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 48 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 56 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 64 ; repeated (64bit) to fill 3dnow reg
-.six equ 72 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 80 ; repeated (64bit) to fill 3dnow reg
-.two equ 88 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 96 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 104 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 112
-.innerjjnr0 equ 116
-.innerk0 equ 120
-.innerjjnr equ 124
-.innerk equ 128
-.fix equ 132
-.fiy equ 136
-.fiz equ 140
-.dx1 equ 144
-.dy1 equ 148
-.dz1 equ 152
-.dx2 equ 156
-.dy2 equ 160
-.dz2 equ 164
-.nsvdwc equ 168
-.nscoul equ 172
-.nsvdw equ 176
-.solnr equ 180
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 184 ; local stack space
- femms
- movq mm0, [mm_six]
- movq mm1, [mm_twelve]
- movq [esp + .six], mm0
- movq [esp + .twelve], mm1
- movq mm2, [mm_two]
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm2
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- movq mm1, mm0
- pfmul mm1,mm1 ; mm1=invsq
- movq mm2, mm1
- pfmul mm2,mm1
- pfmul mm2,mm1 ; mm2=rinvsix
- movq mm1,mm2
- pfmul mm1,mm1 ; mm1=rinvtwelve
-
- pfmul mm3, [esp + .tabscale]
-
- pfmul mm1, [esp + .c12]
-
- pfmul mm2, [esp + .c6]
-
- movq mm4, mm1
- pfsub mm4, mm2 ; mm4 = vnb12-vnb6
-
- pfmul mm2, [esp + .six]
- pfmul mm1, [esp + .twelve]
-
- pfsub mm1, mm2
- pfmul mm1, mm0 ; mm1= (12*vnb12-6*vnb6)*rinv11
-
- pfsub mm1, mm3
-
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm4, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm4 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdwc_inner
- jmp .updateouterdata_vdwc
-.single_vdwc_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- movq mm1, mm0
- pfmul mm1,mm1 ; mm1=invsq
- movq mm2, mm1
- pfmul mm2,mm1
- pfmul mm2,mm1 ; mm2=rinvsix
- movq mm1,mm2
- pfmul mm1,mm1 ; mm1=rinvtwelve
-
- pfmul mm3, [esp + .tabscale]
-
- pfmul mm1, [esp + .c12]
-
- pfmul mm2, [esp + .c6]
-
- movq mm4, mm1
- pfsub mm4, mm2 ; mm4 = vnb12-vnb6
-
- pfmul mm2, [esp + .six]
- pfmul mm1, [esp + .twelve]
-
- pfsub mm1, mm2
- pfmul mm1, mm0 ; mm1= (12*vnb12-6*vnb6)*rinv11
-
- pfsub mm1, mm3
-
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm4, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm4 ; store the sum
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
-.testcoul
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-.unroll_coul_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- and [esp + .innerk], dword 1
- jnz .single_coul_inner
- jmp .updateouterdata_coul
-.single_coul_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
-.testvdw
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrcp mm0, mm4 ; lookup reciprocal seed
- pfrcp mm1, mm6
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- ; amd 3dnow N-R iteration to get full precision.
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0
- ;; mm4 now contains invsq,
- ;; do potential and fscal
- movq mm0, mm4
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5,mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdw_inner
- jmp .updateouterdata_vdw
-.single_vdw_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm4=rsq
-
- pfrcp mm0,mm4
- pfrcpit1 mm4,mm0
- pfrcpit2 mm4,mm0 ; mm4=invsq
- ;; calculate potentials and scalar force
- movq mm0, mm4
-
- pfmul mm4, mm0
- pfmul mm4, mm0 ; mm4=rinvsix
- movq mm5, mm4
- pfmul mm5, mm5 ; mm5=rinvtwelve
-
- pfmul mm5, [esp + .c12]
- pfmul mm4, [esp + .c6]
- movq mm6, mm5 ; mm6 is vnb12-vnb6
- pfsub mm6, mm4
-
- pfmul mm4, [esp + .six]
-
- pfmul mm5, [esp + .twelve]
- pfsub mm5, mm4
- pfmul mm0, mm5 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 184
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3120_3dnow ; 3DNow! inner loop: table-lookup Coulomb + 12-6 LJ, 3-site (O,H1,H2) i molecule against single j atoms
-%$nri arg ; outer-loop count, decremented in place
-%$iinr arg ; pointer to i atom indices, advanced 4 bytes per outer iteration
-%$jindex arg ; pointer to jindex[]; entries [n] and [n+1] delimit the j list
-%$jjnr arg ; base of the j atom index list
-%$shift arg ; pointer to shift indices, advanced 4 bytes per outer iteration
-%$shiftvec arg ; base of shift vectors (3 dwords each)
-%$fshift arg ; base of shift force accumulators (3 dwords each)
-%$gid arg ; pointer to group indices, advanced 4 bytes per outer iteration
-%$pos arg ; base of coordinates (3 dwords per atom)
-%$faction arg ; base of forces (3 dwords per atom)
-%$charge arg ; base of per-atom charges
-%$facel arg ; factor multiplied into the i charges
-%$Vc arg ; coulomb energy accumulators, indexed by group index
-%$type arg ; base of per-atom type indices
-%$ntype arg ; multiplied with 2*type[ii] to form ntia
-%$nbfp arg ; base of c6/c12 pairs
-%$Vnb arg ; LJ energy accumulators, indexed by group index
-%$tabscale arg ; multiplies r to give the table position
-%$VFtab arg ; base of the coulomb table (4 dwords per point)
- ;; stack offsets (bytes from esp) for local variables; 244 bytes are reserved below
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; 64 bit: H1 in low half, H2 in high half
-.iyH equ 28 ; 64 bit: H1 in low half, H2 in high half
-.izH equ 36 ; 64 bit: H1 in low half, H2 in high half
-.iqO equ 44 ; 64 bit slot, facel*charge[ii] in the low half
-.iqH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqO equ 60 ; 64 bit slot, only the low dword is stored
-.qqH equ 68 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 76 ; 64 bit: two partial sums, added together at the end
-.vnbtot equ 84 ; 64 bit: two partial sums, added together at the end
-.c6 equ 92 ; 64 bit slot, value in the low half (stored from a movd load)
-.c12 equ 100 ; 64 bit slot, value in the low half (stored from a movd load)
-.six equ 108 ; 64 bit copy of mm_six
-.twelve equ 116 ; 64 bit copy of mm_twelve
-.two equ 124 ; 64 bit copy of mm_two
-.n1 equ 132 ; 64 bit: one (oxygen) or two (hydrogens) integer table indices
-.tabscale equ 140 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 148 ; 32 bit integer is stored here (8 bytes reserved)
-.innerjjnr equ 156
-.innerk equ 160
-.fixO equ 164
-.fiyO equ 168
-.fizO equ 172
-.fixH equ 176 ; 64 bit: H1 in low half, H2 in high half
-.fiyH equ 184 ; 64 bit: H1 in low half, H2 in high half
-.fizH equ 192 ; 64 bit: H1 in low half, H2 in high half
-.dxO equ 200
-.dyO equ 204
-.dzO equ 208
-.dxH equ 212 ; 64 bit: H1 in low half, H2 in high half
-.dyH equ 220 ; 64 bit: H1 in low half, H2 in high half
-.dzH equ 228 ; 64 bit: H1 in low half, H2 in high half
-.tmprsqH equ 236 ; 64 bit: rsq for H1 (low) and H2 (high)
- push eax ; save the general registers (restored in reverse order at .end)
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 244 ; local stack space
- femms ; fast emms before the 3DNow! code
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[] (not advanced here: charges and type are read once, before .outer)
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0]
- pfmul mm2, mm1
- movq [esp + .iqO], mm2 ; iqO = facel*charge[ii]
-
- movd mm2, [edx + ebx*4 + 4] ; mm2=charge[ii0+1]
- pfmul mm2, mm1
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iqH], mm2 ; iqH = facel*charge[ii0+1]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- shl edx, 1
- mov ecx, edx
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-
- movq mm3, [mm_two] ; mm_two, mm_six and mm_twelve are defined outside this routine
- movq mm4, [mm_six]
- movq mm5, [mm_twelve]
- movq mm6, [ebp + %$tabscale] ; 64 bit load, only the low dword survives the spread below
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .two], mm3
- movq [esp + .six], mm4
- movq [esp + .twelve], mm5
- movq [esp + .tabscale], mm6
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 64 bit store also covers the low dword of .ixH, which is rewritten below
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot, vnbtot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movd [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4 ; advance pointer
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; one j particle per iteration (this loop is not unrolled).
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
- ;prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best (disabled)
-
- mov ecx, [ebp + %$charge]
- movd mm7, [ecx + eax*4]
- punpckldq mm7,mm7
- movq mm6,mm7
- pfmul mm6, [esp + .iqO]
- pfmul mm7, [esp + .iqH] ; mm6=qqO, mm7=qqH
- movd [esp + .qqO], mm6 ; NOTE(review): 32 bit store, but .qqO is read with 64 bit pfmul below; offsets 64-67 are never written in this routine
- movq [esp + .qqH], mm7
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr]
- mov ecx, [ebp + %$nbfp]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [ecx + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [ecx + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- lea eax, [eax + eax*2] ; replace jnr with j3
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO] ; 64 bit read: the high half picks up the low dword of .ixH
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; low mm0=rsqO (the high half is not a valid rsq)
-
- punpckldq mm2, mm2 ; mm2-mm4 were already spread above, these repeat it
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0 ; lookup inverse square root seed for the oxygen
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0, mm1 ; mm0=r
-
- pfmul mm0, [esp + .tabscale] ; mm0=rt
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; 4 dwords per table point
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;; Fp = second table value + Geps + Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqO] ; fijC=qq*FF
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- movq mm3, mm7
- pfmul mm3, [esp + .tabscale] ; mm3 = fijC*tabscale
-
- ; nontabulated LJ - mm1 is invsqrt. - keep mm1!
- movq mm0, mm1
- pfmul mm0, mm0 ; mm0 is invsq
- movq mm2, mm0
- pfmul mm2, mm0
- pfmul mm2, mm0 ; mm2 = rinvsix
- movq mm4, mm2
- pfmul mm4, mm4 ; mm4=rinvtwelve
-
- pfmul mm4, [esp + .c12]
- pfmul mm2, [esp + .c6]
- movq mm5, mm4
- pfsub mm5, mm2 ; mm5=vnb12-vnb6
-
- pfmul mm2, [esp + .six]
- pfmul mm4, [esp + .twelve]
- pfsub mm4, mm2
- pfmul mm4, mm1 ; mm4=(12*vnb12-6*vnb6)*rinv11
-
- pfsubr mm3, mm4 ; mm3 = mm4 - mm3 (LJ term minus fijC*tabscale)
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; now do the two hydrogens.
- movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-
- pfrsqrt mm1, mm0 ; seed for the low half (H1)
- pswapd mm0,mm0 ; swap halves to reach H2
- pfrsqrt mm2, mm0 ; seed for H2
- pswapd mm0,mm0 ; swap back
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale] ; mm0=rt
- pf2iw mm4, mm0
- movq [esp + .n1], mm4 ; both table indices, H1 low and H2 high
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; 4 dwords per table point
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; second index (H2) goes to the high halves
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;; Fp = second table value + Geps + Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk] ; NOTE(review): the body runs before this test, so innerk is presumably >= 1 on entry - confirm
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3 ; swap halves so fzH2 is in the low half
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; same for Vnb.
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
- ;; finish if last
- dec dword [ebp + %$nri] ; nri is counted down in place in the argument area
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms ; fast emms after the 3DNow! code
- add esp, 244 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
- ;; inl3130_3dnow: 3DNow! inner loop for pairs of three-site (O,H,H) molecules.
- ;; For each of the nri i molecules it walks the j entries
- ;; jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], one j molecule per pass
- ;; through .inner_loop (there is no unrolled variant in this routine).
- ;; All nine site pairs get a Coulomb term interpolated from VFtab
- ;; (4 floats per table point); only the O-O pair additionally gets a
- ;; 6-12 Lennard-Jones term built from nbfp.
- ;; Forces are accumulated into faction[] and fshift[], energies into
- ;; Vc[gid] and Vnb[gid]. The iinr, shift, jindex and gid pointers and the
- ;; nri counter are advanced/decremented in place in their [ebp + ...] slots.
-proc inl3130_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets (in bytes from esp) for local variables; the frame is 232 bytes
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.qqOO equ 44 ; repeated (64bit) to fill 3dnow reg
-.qqOH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqHH equ 60 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 68 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 76 ; repeated (64bit) to fill 3dnow reg
-.six equ 84 ; repeated (64bit) to fill 3dnow reg
-.twelve equ 92 ; repeated (64bit) to fill 3dnow reg
-.two equ 100 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 108 ; 64 bit: two integer table indices (low/high half)
-.tabscale equ 116 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 124 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 132 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 140
-.innerk equ 144
-.fixO equ 148
-.fiyO equ 152
-.fizO equ 156
-.fixH equ 160 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 168 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 176 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 184
-.dyO equ 188
-.dzO equ 192
-.dxH equ 200 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 208 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 216 ; repeated (64bit) to fill 3dnow reg
-.tmprsqH equ 224 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 232 ; local stack space
- femms
- ;; assume we have at least one i particle - start directly
- ;; charges and LJ parameters are read from the first iinr entry only
- ;; (iinr is not advanced here) and reused for every outer iteration.
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel] ; mm1=facel
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0] (O)
- movd mm3, [edx + ebx*4 + 4] ; mm3=charge[ii0+1] (H)
- movq mm4, mm2
- pfmul mm4, mm1
- movq mm6, mm3
- pfmul mm6, mm1
- movq mm5, mm4
- pfmul mm4, mm2 ; mm4=qqOO*facel
- pfmul mm5, mm3 ; mm5=qqOH*facel
- pfmul mm6, mm3 ; mm6=qqHH*facel
- punpckldq mm5,mm5 ; spread to both halves
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .qqOO], mm4
- movq [esp + .qqOH], mm5
- movq [esp + .qqHH], mm6
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4] ; ecx = type[ii]
- shl ecx, 1 ; ecx = 2*type
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = 2*type*ntype
- add edx, ecx ; edx = 2*type*(ntype+1)
- mov eax, [ebp + %$nbfp]
- movd mm0, [eax + edx*4] ; mm0 = c6 = nbfp[edx]
- movd mm1, [eax + edx*4 + 4] ; mm1 = c12 = nbfp[edx+1]
- movq [esp + .c6], mm0
- movq [esp + .c12], mm1
- movq mm2, [mm_two]
- movq mm3, [mm_six]
- movq mm4, [mm_twelve]
- movq [esp + .two], mm2
- movq [esp + .six], mm3
- movq [esp + .twelve], mm4
- movd mm5, [ebp + %$tabscale]
- punpckldq mm5,mm5
- movq [esp + .tabscale], mm5
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 8-byte store also covers the first 4 bytes of .ixH, which is written below
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot, vnbtot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movq [esp + .fizO], mm7 ; 8-byte store; the upper 4 bytes land in .fixH, cleared next anyway
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; one j molecule per iteration: j O first, then j H1 and j H2, each against i O and both i H.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
-
- lea eax, [eax + eax*2] ; eax = 3*jnr = j3
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm0 ; low half = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2 ; (halves are already equal from the spread above)
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt OO
- pfmul mm0, mm1 ; mm0=r OO (rsq*invsqrt)
-
- pfmul mm0, [esp + .tabscale] ; mm0=rt=r*tabscale
- pf2iw mm4, mm0 ; mm4=integer part of rt
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2 ; 4 table floats per point (the *4 in the loads converts to bytes)
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOO] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- movq mm3, mm7
- pfmul mm3, [esp + .tabscale] ; mm3=fijC*tabscale
-
- movq mm5, mm1
- pfmul mm5,mm5 ; mm5=rinvsq
- movq mm4, mm5
- pfmul mm4,mm5
- pfmul mm4,mm5
- movq mm5, mm4
- pfmul mm5,mm5 ; mm4=rinvsix, mm5=rinvtwelve
-
- pfmul mm4, [esp + .c6] ; mm4=c6*rinvsix
- pfmul mm5, [esp + .c12] ; mm5=c12*rinvtwelve
- movq mm6,mm5
- pfsub mm6,mm4 ; mm6=vnb12-vnb6
-
- pfmul mm4, [esp + .six]
- pfmul mm5, [esp + .twelve]
- pfsub mm5,mm4 ; mm5=12*vnb12-6*vnb6
- pfmul mm5, mm1
- pfsubr mm3, mm5 ; mm3 = mm5 - mm3 (LJ part minus tabulated coulomb part)
-
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; update vnbtot
- pfadd mm6, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm6 ; store the sum
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; time for hydrogens!
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4 ; store both table indices (H1 low, H2 high)
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; second table index
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force (j O, offset 0)
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ; interactions with j H1.
-
- movq mm0, [esi + eax*4 + 12]
- movd mm1, [esi + eax*4 + 20]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1 ; low half = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1 ; mm0=r
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, force is moved to mm3.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3, mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force (j H1, offset 12)
- movq mm2, [edi + eax*4 + 12]
- movd mm3, [edi + eax*4 + 20]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 12], mm2
- movd [edi + eax*4 + 20], mm3
-
- ; interactions with j H2
- movq mm0, [esi + eax*4 + 24]
- movd mm1, [esi + eax*4 + 32]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1 ; low half = dx*dx + dy*dy
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1 ; mm0=r
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3,mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force (j H2, offset 24)
- movq mm2, [edi + eax*4 + 24]
- movd mm3, [edi + eax*4 + 32]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 24], mm2
- movd [edi + eax*4 + 32], mm3
-
- ;; done - one more?
- ;; NOTE: the body runs before this test, so an empty j list (innerk == 0) is not handled.
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; same for Vnb.
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
- ;; finish if last
- dec dword [ebp + %$nri]
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 232 ; release the local frame reserved at entry
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-proc inl3300_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ix equ 8
-.iy equ 12
-.iz equ 16
-.iq equ 20 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 28 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 36 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 44 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 52 ; repeated (64bit) to fill 3dnow reg
-.two equ 60 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 68 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 76 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 84
-.innerjjnr equ 88
-.innerk equ 92
-.fix equ 96
-.fiy equ 100
-.fiz equ 104
-.dx1 equ 108
-.dy1 equ 112
-.dz1 equ 116
-.dx2 equ 120
-.dy2 equ 124
-.dz2 equ 128
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 132 ; local stack space
- femms
- ; move data to local stack
- movq mm0, [mm_two]
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm0, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm3, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx
- pfadd mm1, mm3
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear total potential and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; dispersion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4 + 32]
- punpckldq mm5, [edx + ecx*4 + 36]
- punpckldq mm6, [edx + ecx*4 + 40]
- punpckldq mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- and [esp + .innerk], dword 1
- jnz .single_inner
- jmp .updateouterdata
-.single_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 132
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-proc inl3310_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.shX equ 8
-.shY equ 12
-.shZ equ 16
-.ix equ 20
-.iy equ 24
-.iz equ 28
-.iq equ 32 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 40 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 48 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 56 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 64 ; repeated (64bit) to fill 3dnow reg
-.two equ 72 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 80 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 88 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 96
-.innerjjnr0 equ 100
-.innerk0 equ 104
-.innerjjnr equ 108
-.innerk equ 112
-.fix equ 116
-.fiy equ 120
-.fiz equ 124
-.dx1 equ 128
-.dy1 equ 132
-.dz1 equ 136
-.dx2 equ 140
-.dy2 equ 144
-.dz2 equ 148
-.nsvdwc equ 152
-.nscoul equ 156
-.nsvdw equ 160
-.solnr equ 164
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 168 ; local stack space
- femms
- movq mm0, [mm_two]
- movd mm3, [ebp + %$tabscale]
- movq [esp + .two], mm0
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm0, [eax + ebx*4] ; move shX/shY to mm0 and shZ to mm1.
- movd mm1, [eax + ebx*4 + 8]
- movq [esp + .shX], mm0
- movd [esp + .shZ], mm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; dispersion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4 + 32]
- punpckldq mm5, [edx + ecx*4 + 36]
- punpckldq mm6, [edx + ecx*4 + 40]
- punpckldq mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdwc_inner
- jmp .updateouterdata_vdwc
-.single_vdwc_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
-.testcoul
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
- mov edx, [ebp + %$charge]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii]
- pfmul mm2, [ebp + %$facel]
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iq], mm2 ; iq =facel*charge[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-.unroll_coul_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge] ; base of charge[]
- movq mm5, [esp + .iq]
- movd mm3, [ecx + eax*4] ; charge[jnr1]
- punpckldq mm3, [ecx + ebx*4] ; move charge 2 to high part of mm3
- pfmul mm3,mm5 ; mm3 now has qq for both particles
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; coulomb table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- and [esp + .innerk], dword 1
- jnz .single_coul_inner
- jmp .updateouterdata_coul
-.single_coul_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov ecx, [ebp + %$charge]
- movd mm5, [esp + .iq]
- movd mm3, [ecx + eax*4]
- pfmul mm3, mm5 ; mm3=qq
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
-
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, mm3 ; vcoul=qq*VV
- pfmul mm3, mm7 ; fijC=FF*qq
-
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- pfadd mm5, [esp + .vctot] ; add the earlier value
- movq [esp + .vctot], mm5 ; store the sum
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
-.testvdw
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movq mm0, [eax + ebx*4]
- movd mm1, [eax + ebx*4 + 8]
- pfadd mm0, [esp + .shX]
- pfadd mm1, [esp + .shZ]
- movq [esp + .ix], mm0
- movd [esp + .iz], mm1
-
- ;; clear forces.
- pxor mm7,mm7
- movq [esp + .fix], mm7
- movd [esp + .fiz], mm7
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 2
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; paired innerloop here.
- mov ecx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [ecx]
- mov ebx, [ecx + 4] ; eax/ebx=jnr
- add [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2)
- prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- mov ecx, [ecx + ebx*4] ; type [jnr2]
-
- mov esi, [ebp + %$nbfp] ; base of nbfp
- shl edx, 1
- shl ecx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- add ecx, [esp + .ntia]
-
- movq mm5, [esi + edx*4] ; mm5 = 1st c6 / c12
- movq mm7, [esi + ecx*4] ; mm7 = 2nd c6 / c12
- movq mm6,mm5
- punpckldq mm5,mm7 ; mm5 = 1st c6 / 2nd c6
- punpckhdq mm6,mm7 ; mm6 = 1st c12 / 2nd c12
- movq [esp + .c6], mm5
- movq [esp + .c12], mm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mov esi, [ebp + %$pos]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4] ; fetch first j coordinates
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4,mm0 ; dr = ir - jr
- pfsubr mm5,mm1
- movq [esp + .dx1], mm4 ; store dr
- movd [esp + .dz1], mm5
- pfmul mm4,mm4 ; square dx,dy,dz
- pfmul mm5,mm5
- pfacc mm4, mm5 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm4, mm5 ; first rsq in lower mm4
-
- movq mm6, [esi + ebx*4] ; fetch second j coordinates
- movd mm7, [esi + ebx*4 + 8]
-
- pfsubr mm6,mm0 ; dr = ir - jr
- pfsubr mm7,mm1
- movq [esp + .dx2], mm6 ; store dr
- movd [esp + .dz2], mm7
- pfmul mm6,mm6 ; square dx,dy,dz
- pfmul mm7,mm7
- pfacc mm6, mm7 ; accumulate to get dx*dx+dy*dy+dz*dz
- pfacc mm6, mm7 ; second rsq in lower mm6
-
- pfrsqrt mm0, mm4 ; lookup inverse square root seed
- pfrsqrt mm1, mm6
-
-
- punpckldq mm0,mm1
- punpckldq mm4,mm6 ; now 4 has rsq and 0 the seed for both pairs.
- movq mm2,mm0 ; amd 3dnow N-R iteration to get full precision.
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
- ;; do potential and fscal
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- ; dispersion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4 + 16]
- punpckldq mm5, [edx + ecx*4 + 20]
- punpckldq mm6, [edx + ecx*4 + 24]
- punpckldq mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm1, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- prefetchw [esp + .dx1] ; prefetch i forces to cache
-
- ;; spread fscalar to both positions
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- ;; calc vector force
- prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
- movq mm2, [esp + .dx1] ; fetch dr
- movd mm3, [esp + .dz1]
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
- pfmul mm2, mm0 ; mult by fs
- pfmul mm3, mm0
-
- movq mm4, [esp + .dx2] ; fetch dr
- movd mm5, [esp + .dz2]
- pfmul mm4, mm1 ; mult by fs
- pfmul mm5, mm1
- ;; update i forces
-
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
-
- pfadd mm0, mm4
- pfadd mm1, mm5
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j forces
-
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax*4 + 8]
- movq mm6, [edi + ebx*4]
- movd mm7, [edi + ebx*4 + 8]
-
- pfsub mm0, mm2
- pfsub mm1, mm3
- pfsub mm6, mm4
- pfsub mm7, mm5
-
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- movq [edi + ebx*4], mm6
- movd [edi + ebx*4 + 8], mm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 2
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- and [esp + .innerk], dword 1
- jnz .single_vdw_inner
- jmp .updateouterdata_vdw
-.single_vdw_inner:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
-
- mov esi, [ebp + %$nbfp]
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr1]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [esi + edx*4] ; mm5 = 1st c6
- movq [esp + .c6], mm5
- movd mm5, [esi + edx*4 + 4] ; mm5 = 1st c12
- movq [esp + .c12], mm5
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- movq mm0, [esp + .ix]
- movd mm1, [esp + .iz]
- movq mm4, [esi + eax*4]
- movd mm5, [esi + eax*4 + 8]
- pfsubr mm4, mm0
- pfsubr mm5, mm1
- movq [esp + .dx1], mm4
- pfmul mm4,mm4
- movd [esp + .dz1], mm5
- pfmul mm5,mm5
- pfacc mm4, mm5
- pfacc mm4, mm5 ; mm0=rsq
-
- pfrsqrt mm0,mm4
- movq mm2,mm0
- pfmul mm0,mm0
- pfrsqit1 mm0,mm4
- pfrcpit2 mm0,mm2 ; mm1=invsqrt
- pfmul mm4, mm0
- movq mm1, mm4
- ;; mm0 is invsqrt, and mm1 r.
-
- ;; calculate potentials and scalar force
- pfmul mm1, [esp + .tabscale] ; mm1=rt
- pf2iw mm4,mm1
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm1, mm4 ; now mm1 is eps and mm4 n0.
-
- movq mm2,mm1
- pfmul mm2,mm2 ; mm1 is eps, mm2 is eps2
-
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- movq mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
-
- pfmul mm6, mm1 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm1 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of mm3
- pxor mm1,mm1
- pfsub mm1, mm3
- pfmul mm0, [esp + .tabscale]
- pfmul mm0, mm1 ; mm0 is total fscal now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; spread fscalar to both positions
- punpckldq mm0,mm0
- ;; calc vectorial force
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm2, [esp + .dx1]
- movd mm3, [esp + .dz1]
-
- pfmul mm2, mm0
- pfmul mm3, mm0
-
- ;; update i particle force
- movq mm0, [esp + .fix]
- movd mm1, [esp + .fiz]
- pfadd mm0, mm2
- pfadd mm1, mm3
- movq [esp + .fix], mm0
- movd [esp + .fiz], mm1
- ;; update j particle force
- movq mm0, [edi + eax*4]
- movd mm1, [edi + eax *4+ 8]
- pfsub mm0, mm2
- pfsub mm1, mm3
- movq [edi + eax*4], mm0
- movd [edi + eax*4 +8], mm1
- ;; done!
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment i force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fix]
- pfadd mm7, [esp + .fiz]
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- femms
- add esp, 168
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-;; inl3320_3dnow: nonbonded inner loop written with 3DNow! instructions.
-;; Each i entry consists of three consecutive sites in pos[] (O, H1, H2).
-;; For every j particle in jjnr[jindex[n] .. jindex[n+1]) the loop evaluates
-;; a tabulated Coulomb interaction for all three i sites and tabulated
-;; dispersion + repulsion for the O site only.
-;; Forces are accumulated into faction[] and fshift[], potentials into
-;; Vc[gid] and Vnb[gid].
-;; Both loop counters are decremented and tested only after a pass, so the
-;; code requires nri >= 1 and at least one j particle per i entry.
-proc inl3320_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.iqO equ 44 ; repeated (64bit) to fill 3dnow reg
-.iqH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqO equ 60 ; 8-byte slot, only the low dword is ever written
-.qqH equ 68 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 76 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 84 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 92 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 100 ; repeated (64bit) to fill 3dnow reg
-.two equ 108 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 116 ; table indices from pf2iw (one dword for O, two for H1/H2)
-.tabscale equ 124 ; repeated (64bit) to fill 3dnow reg
-.ntia equ 132 ; 32-bit integer in an 8-byte slot
-.innerjjnr equ 140
-.innerk equ 144
-.fixO equ 148
-.fiyO equ 152
-.fizO equ 156
-.fixH equ 160 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 168 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 176 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 184
-.dyO equ 188
-.dzO equ 192
-.dxH equ 196 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 204 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 212 ; repeated (64bit) to fill 3dnow reg
-.tmprsqH equ 220 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 228 ; local stack space
- femms
-
- ;; one-time setup: the charge and type of the FIRST i entry are used for all i entries.
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel]
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0]
- pfmul mm2, mm1
- movq [esp + .iqO], mm2 ; iqO = facel*charge[ii] (high half is zero)
-
- movd mm2, [edx + ebx*4 + 4] ; mm2=charge[ii0+1]
- pfmul mm2, mm1
- punpckldq mm2,mm2 ; spread to both halves
- movq [esp + .iqH], mm2 ; iqH = facel*charge[ii0+1]
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-
- movq mm3, [mm_two]
- movq mm4, [ebp + %$tabscale] ; 64-bit load; only the low dword survives the punpckldq below
- punpckldq mm4,mm4 ; spread to both halves
- movq [esp + .two], mm3
- movq [esp + .tabscale], mm4
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6 ; 8-byte store also covers the low dword of .ixH, which is rewritten below
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot, vnbtot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movd [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle per iteration - this loop is not unrolled.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
- ;prefetch [ecx + 16] ; prefetch data - trial and error says 16 is best
-
- mov ecx, [ebp + %$charge]
- movd mm7, [ecx + eax*4]
- punpckldq mm7,mm7
- movq mm6,mm7
- pfmul mm6, [esp + .iqO]
- pfmul mm7, [esp + .iqH] ; mm6=qqO, mm7=qqH
- movd [esp + .qqO], mm6 ; stores the low dword only
- movq [esp + .qqH], mm7
-
- mov ecx, [ebp + %$type]
- mov edx, [ecx + eax*4] ; type [jnr]
- mov ecx, [ebp + %$nbfp]
- shl edx, 1
- add edx, [esp + .ntia] ; tja = ntia + 2*type
- movd mm5, [ecx + edx*4] ; mm5 = c6
- movq [esp + .c6], mm5
- movd mm5, [ecx + edx*4 + 4] ; mm5 = c12
- movq [esp + .c12], mm5
-
- lea eax, [eax + eax*2] ; eax = 3*jnr
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; low mm0=rsqO
-
- punpckldq mm2, mm2 ; (registers were already spread above, so these three leave them unchanged)
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0 ; inverse square root seed for rsqO
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0, mm1 ; mm0=r
-
- pfmul mm0, [esp + .tabscale] ; mm0=rt
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2 ; ecx = 12*n0; with the *4 scale below that is 12 floats per table point
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqO] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- movq mm3, mm7
-
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of fscal, scale by tabscale and multiply with rinv
- pxor mm0,mm0
- pfsubr mm3, mm0 ; mm3 = 0 - mm3
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; now do the two hydrogens.
- movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-
- pfrsqrt mm1, mm0 ; seed for the low half (H1)
- pswapd mm0,mm0 ; swap halves to reach the H2 value
- pfrsqrt mm2, mm0 ; seed for H2
- pswapd mm0,mm0 ; restore the original order
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale] ; mm0=rt
- pf2iw mm4, mm0
- movq [esp + .n1], mm4 ; both table indices: H1 in the low dword, H2 in the high
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1] ; index for the low half (H1)
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4] ; index for the high half (H2)
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC, scale by tabscale and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- prefetchw [edi + eax*4] ; prefetch faction to cache
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3 ; bring the H2 z component into the low half
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO] ; add the O, H1 and H2 forces in turn
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; same for Vnb.
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnb[gid]
- ;; finish if last
- dec dword [ebp + %$nri] ; the nri argument doubles as the remaining-iterations counter
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 228 ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3330_3dnow
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
-.is3 equ 0
-.ii3 equ 4
-.ixO equ 8
-.iyO equ 12
-.izO equ 16
-.ixH equ 20 ; repeated (64bit) to fill 3dnow reg
-.iyH equ 28 ; repeated (64bit) to fill 3dnow reg
-.izH equ 36 ; repeated (64bit) to fill 3dnow reg
-.qqOO equ 44 ; repeated (64bit) to fill 3dnow reg
-.qqOH equ 52 ; repeated (64bit) to fill 3dnow reg
-.qqHH equ 60 ; repeated (64bit) to fill 3dnow reg
-.c6 equ 68 ; repeated (64bit) to fill 3dnow reg
-.c12 equ 76 ; repeated (64bit) to fill 3dnow reg
-.two equ 84 ; repeated (64bit) to fill 3dnow reg
-.n1 equ 92 ; repeated (64bit) to fill 3dnow reg
-.tabscale equ 100 ; repeated (64bit) to fill 3dnow reg
-.vctot equ 108 ; repeated (64bit) to fill 3dnow reg
-.vnbtot equ 116 ; repeated (64bit) to fill 3dnow reg
-.innerjjnr equ 124
-.innerk equ 128
-.fixO equ 132
-.fiyO equ 136
-.fizO equ 140
-.fixH equ 144 ; repeated (64bit) to fill 3dnow reg
-.fiyH equ 152 ; repeated (64bit) to fill 3dnow reg
-.fizH equ 160 ; repeated (64bit) to fill 3dnow reg
-.dxO equ 168
-.dyO equ 172
-.dzO equ 176
-.dxH equ 180 ; repeated (64bit) to fill 3dnow reg
-.dyH equ 188 ; repeated (64bit) to fill 3dnow reg
-.dzH equ 196 ; repeated (64bit) to fill 3dnow reg
-.tmprsqH equ 204 ; repeated (64bit) to fill 3dnow reg
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 212 ; local stack space
- femms
- ;; assume we have at least one i particle - start directly
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movd mm1, [ebp + %$facel] ; mm1=facel
- movd mm2, [edx + ebx*4] ; mm2=charge[ii0] (O)
- movd mm3, [edx + ebx*4 + 4] ; mm2=charge[ii0+1] (H)
- movq mm4, mm2
- pfmul mm4, mm1
- movq mm6, mm3
- pfmul mm6, mm1
- movq mm5, mm4
- pfmul mm4, mm2 ; mm4=qqOO*facel
- pfmul mm5, mm3 ; mm5=qqOH*facel
- pfmul mm6, mm3 ; mm6=qqHH*facel
- punpckldq mm5,mm5 ; spread to both halves
- punpckldq mm6,mm6 ; spread to both halves
- movq [esp + .qqOO], mm4
- movq [esp + .qqOH], mm5
- movq [esp + .qqHH], mm6
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- mov edx, ecx
- imul ecx, [ebp + %$ntype]
- add edx, ecx
- mov eax, [ebp + %$nbfp]
- movd mm0, [eax + edx*4]
- movd mm1, [eax + edx*4 + 4]
- movq [esp + .c6], mm0
- movq [esp + .c12], mm1
- movq mm2, [mm_two]
- movq [esp + .two], mm2
- movd mm3, [ebp + %$tabscale]
- punpckldq mm3,mm3
- movq [esp + .tabscale], mm3
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movq mm5, [eax + ebx*4] ; move shX/shY to mm5 and shZ to mm6.
- movd mm6, [eax + ebx*4 + 8]
- movq mm0, mm5
- movq mm1, mm5
- movq mm2, mm6
- punpckldq mm0,mm0 ; also expand shX,Y,Z in mm0--mm2.
- punpckhdq mm1,mm1
- punpckldq mm2,mm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- pfadd mm5, [eax + ebx*4] ; ix = shX + posX (and iy too)
- movd mm7, [eax + ebx*4 + 8] ; cant use direct memory add for 4 bytes (iz)
- mov [esp + .ii3], ebx ; (use mm7 as temp. storage for iz.)
- pfadd mm6, mm7
- movq [esp + .ixO], mm5
- movq [esp + .izO], mm6
-
- movd mm3, [eax + ebx*4 + 12]
- movd mm4, [eax + ebx*4 + 16]
- movd mm5, [eax + ebx*4 + 20]
- punpckldq mm3, [eax + ebx*4 + 24]
- punpckldq mm4, [eax + ebx*4 + 28]
- punpckldq mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-
- pfadd mm0, mm3
- pfadd mm1, mm4
- pfadd mm2, mm5
- movq [esp + .ixH], mm0
- movq [esp + .iyH], mm1
- movq [esp + .izH], mm2
-
- ;; clear vctot and i forces.
- pxor mm7,mm7
- movq [esp + .vctot], mm7
- movq [esp + .vnbtot], mm7
- movq [esp + .fixO], mm7
- movq [esp + .fizO], mm7
- movq [esp + .fixH], mm7
- movq [esp + .fiyH], mm7
- movq [esp + .fizH], mm7
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov [esp + .innerk], edx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
-.inner_loop:
- ;; a single j particle iteration here - compare with the unrolled code for comments.
- mov eax, [esp + .innerjjnr]
- mov eax, [eax] ; eax=jnr offset
- add [esp + .innerjjnr], dword 4 ; advance pointer
-
- lea eax, [eax + eax*2]
-
- movq mm0, [esi + eax*4]
- movd mm1, [esi + eax*4 + 8]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm0
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt OO
- pfmul mm0, mm1 ; mm0=rsq OO
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOO] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOO] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- movq mm3, mm7
-
- ; dispersion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 16]
- movd mm5, [edx + ecx*4 + 20]
- movd mm6, [edx + ecx*4 + 24]
- movd mm7, [edx + ecx*4 + 28]
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm4, [esp + .c6]
- pfmul mm7, mm4 ; fijD
- pfmul mm5, mm4 ; vnb6
- pfadd mm3, mm7 ; add to fscal
-
- ;; update vnbtot to release mm5!
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ; repulsion table
- ; load all the table values we need
- movd mm4, [edx + ecx*4 + 32]
- movd mm5, [edx + ecx*4 + 36]
- movd mm6, [edx + ecx*4 + 40]
- movd mm7, [edx + ecx*4 + 44]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- movq mm6, [esp + .c12]
- pfmul mm7, mm6 ; fijR
- pfmul mm5, mm6 ; vnb12
- pfadd mm3, mm7 ; total fscal fijC+fijD+fijR
-
- ; change sign of fscal and multiply with rinv
- pxor mm0,mm0
- pfsubr mm3, mm0
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- ;; update vnbtot
- pfadd mm5, [esp + .vnbtot] ; add the earlier value
- movq [esp + .vnbtot], mm5 ; store the sum
-
- ;; Ready with the oxygen - potential is updated, fscal is in mm3.
- ;; time for hydrogens!
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4]
- movd mm3, [edi + eax*4 + 8]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4], mm2
- movd [edi + eax*4 +8], mm3
-
- ; interactions with j H1.
-
- movq mm0, [esi + eax*4 + 12]
- movd mm1, [esi + eax*4 + 20]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1 ; mm0=rsq
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, force is moved to mm3.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3, mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 12]
- movd mm3, [edi + eax*4 + 20]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 12], mm2
- movd [edi + eax*4 + 20], mm3
-
- ; interactions with j H2
- movq mm0, [esi + eax*4 + 24]
- movd mm1, [esi + eax*4 + 32]
- ;; copy & expand to mm2-mm4 for the H interactions
- movq mm2, mm0
- movq mm3, mm0
- movq mm4, mm1
- punpckldq mm2,mm2
- punpckhdq mm3,mm3
- punpckldq mm4,mm4
-
- pfsubr mm0, [esp + .ixO]
- pfsubr mm1, [esp + .izO]
-
- movq [esp + .dxO], mm0
- pfmul mm0,mm0
- movd [esp + .dzO], mm1
- pfmul mm1,mm1
- pfacc mm0, mm1
- pfadd mm0, mm1 ; mm0=rsqO
-
- punpckldq mm2, mm2
- punpckldq mm3, mm3
- punpckldq mm4, mm4 ; mm2-mm4 is jx-jz
- pfsubr mm2, [esp + .ixH]
- pfsubr mm3, [esp + .iyH]
- pfsubr mm4, [esp + .izH] ; mm2-mm4 is dxH-dzH
-
- movq [esp + .dxH], mm2
- movq [esp + .dyH], mm3
- movq [esp + .dzH], mm4
- pfmul mm2,mm2
- pfmul mm3,mm3
- pfmul mm4,mm4
-
- pfadd mm3,mm2
- pfadd mm3,mm4 ; mm3=rsqH
- movq [esp + .tmprsqH], mm3
-
- pfrsqrt mm1,mm0
-
- movq mm2,mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
- pfmul mm0, mm1
-
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movd [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
-
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqOH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqOH] ; fijC=qq*FF
-
- ;; update vctot directly, use mm3 for fscal sum.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
- pxor mm3,mm3
- pfsub mm3, mm7
- pfmul mm3, [esp + .tabscale]
- pfmul mm3, mm1 ; mm3 is total fscal (for the oxygen) now
-
- movq mm0, [esp + .tmprsqH]
-
- pfrsqrt mm1, mm0
- pswapd mm0,mm0
- pfrsqrt mm2, mm0
- pswapd mm0,mm0
- punpckldq mm1,mm2 ; seeds are in mm1 now, and rsq in mm0.
-
- movq mm2, mm1
- pfmul mm1,mm1
- pfrsqit1 mm1,mm0
- pfrcpit2 mm1,mm2 ; mm1=invsqrt
-
- pfmul mm0,mm1 ; mm0=r
- pfmul mm0, [esp + .tabscale]
- pf2iw mm4, mm0
- movq [esp + .n1], mm4
- pi2fd mm4,mm4
- pfsub mm0, mm4 ; now mm0 is eps and mm4 n0.
- movq mm2, mm0
- pfmul mm2, mm2 ; mm0 is eps, mm2 eps2
-
- ; coulomb table
- mov edx, [ebp + %$VFtab]
- mov ecx, [esp + .n1]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- ;; load all values we need
- movd mm4, [edx + ecx*4]
- movd mm5, [edx + ecx*4 + 4]
- movd mm6, [edx + ecx*4 + 8]
- movd mm7, [edx + ecx*4 + 12]
- mov ecx, [esp + .n1 + 4]
- lea ecx, [ecx + ecx*2]
- shl ecx, 2
- punpckldq mm4, [edx + ecx*4]
- punpckldq mm5, [edx + ecx*4 + 4]
- punpckldq mm6, [edx + ecx*4 + 8]
- punpckldq mm7, [edx + ecx*4 + 12]
-
-
- pfmul mm6, mm0 ; mm6 = Geps
- pfmul mm7, mm2 ; mm7 = Heps2
- ;;
- pfadd mm5, mm6
- pfadd mm5, mm7 ; mm5 = Fp
-
- pfmul mm7, [esp + .two] ; two*Heps2
- pfadd mm7, mm6
- pfadd mm7, mm5 ; mm7=FF
-
- pfmul mm5, mm0 ; mm5=eps*Fp
- pfadd mm5, mm4 ; mm5= VV
-
- pfmul mm5, [esp + .qqHH] ; vcoul=qq*VV
- pfmul mm7, [esp + .qqHH] ; fijC=qq*FF
- ;; update vctot.
- pfadd mm5, [esp + .vctot]
- movq [esp + .vctot], mm5
-
- ;; change sign of fijC and multiply by rinv
- pxor mm4,mm4
- pfsub mm4, mm7
- pfmul mm4, [esp + .tabscale]
- pfmul mm4, mm1 ; mm4 is total fscal (for the hydrogens) now
-
- ;; spread oxygen fscalar to both positions
- punpckldq mm3,mm3
- ;; calc vectorial force for O
- movq mm0, [esp + .dxO]
- movd mm1, [esp + .dzO]
- pfmul mm0, mm3
- pfmul mm1, mm3
-
- ;; calc vectorial force for H's
- movq mm5, [esp + .dxH]
- movq mm6, [esp + .dyH]
- movq mm7, [esp + .dzH]
- pfmul mm5, mm4
- pfmul mm6, mm4
- pfmul mm7, mm4
-
- ;; update iO particle force
- movq mm2, [esp + .fixO]
- movd mm3, [esp + .fizO]
- pfadd mm2, mm0
- pfadd mm3, mm1
- movq [esp + .fixO], mm2
- movd [esp + .fizO], mm3
-
- ;; update iH forces
- movq mm2, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm4, [esp + .fizH]
- pfadd mm2, mm5
- pfadd mm3, mm6
- pfadd mm4, mm7
- movq [esp + .fixH], mm2
- movq [esp + .fiyH], mm3
- movq [esp + .fizH], mm4
-
- ;; pack j forces from H in the same form as the oxygen force.
- pfacc mm5, mm6 ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
- pfacc mm7, mm7 ; mm7(l)=fjz(H1+H2)
-
- pfadd mm0, mm5 ; add up total force on j particle.
- pfadd mm1, mm7
-
- ;; update j particle force
- movq mm2, [edi + eax*4 + 24]
- movd mm3, [edi + eax*4 + 32]
- pfsub mm2, mm0
- pfsub mm3, mm1
- movq [edi + eax*4 + 24], mm2
- movd [edi + eax*4 + 32], mm3
-
- ;; done - one more?
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .inner_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
-
- movq mm6, [edi + ecx*4] ; increment iO force
- movd mm7, [edi + ecx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- movq [edi + ecx*4], mm6
- movd [edi + ecx*4 +8], mm7
-
- movq mm0, [esp + .fixH]
- movq mm3, [esp + .fiyH]
- movq mm1, [esp + .fizH]
- movq mm2, mm0
- punpckldq mm0, mm3 ; mm0(l)=fxH1, mm0(h)=fyH1
- punpckhdq mm2, mm3 ; mm2(l)=fxH2, mm2(h)=fyH2
- movq mm3, mm1
- pswapd mm3,mm3
- ;; mm1 is fzH1
- ;; mm3 is fzH2
-
- movq mm6, [edi + ecx*4 + 12] ; increment iH1 force
- movd mm7, [edi + ecx*4 + 20]
- pfadd mm6, mm0
- pfadd mm7, mm1
- movq [edi + ecx*4 + 12], mm6
- movd [edi + ecx*4 + 20], mm7
-
- movq mm6, [edi + ecx*4 + 24] ; increment iH2 force
- movd mm7, [edi + ecx*4 + 32]
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [edi + ecx*4 + 24], mm6
- movd [edi + ecx*4 + 32], mm7
-
-
- mov ebx, [ebp + %$fshift] ; increment fshift force
- mov edx, [esp + .is3]
-
- movq mm6, [ebx + edx*4]
- movd mm7, [ebx + edx*4 + 8]
- pfadd mm6, [esp + .fixO]
- pfadd mm7, [esp + .fizO]
- pfadd mm6, mm0
- pfadd mm7, mm1
- pfadd mm6, mm2
- pfadd mm7, mm3
- movq [ebx + edx*4], mm6
- movd [ebx + edx*4 + 8], mm7
-
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- movq mm7, [esp + .vctot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vc]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vc[gid]
-
- movq mm7, [esp + .vnbtot]
- pfacc mm7,mm7 ; get and sum the two parts of total potential
-
- mov eax, [ebp + %$Vnb]
- movd mm6, [eax + edx*4]
- pfadd mm6, mm7
- movd [eax + edx*4], mm6 ; increment vnbtot[gid]
- ;; finish if last
- dec dword [ebp + %$nri]
- jz .end
- ;; not last, iterate once more!
- jmp .outer
-.end:
- femms
- add esp, 212
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
--- /dev/null
+/*
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.0
+ *
+ * Copyright (c) 1991-2001
+ * BIOSON Research Institute, Dept. of Biophysical Chemistry
+ * University of Groningen, The Netherlands
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ *
+ * This file requires GNU binutils 2.10 or later, since we
+ * use intel syntax for portability.
+ */
+
+.intel_syntax noprefix
+
+.text
+
+/*
+ * x86_cpuid: run the CPUID instruction for a caller-supplied function
+ * number and store the four result registers through caller-supplied
+ * pointers.
+ *
+ * Stack arguments (addressed relative to ebp once the frame is set up):
+ *   [ebp+8]   value placed in eax before CPUID is issued
+ *   [ebp+12]  address that receives eax
+ *   [ebp+16]  address that receives ebx
+ *   [ebp+20]  address that receives ecx
+ *   [ebp+24]  address that receives edx
+ *
+ * edi, ebx, ecx and edx are saved on entry and restored on exit.
+ * eax is not saved; it still holds the CPUID eax result on return.
+ */
+.globl x86_cpuid /* issues the cpuid instruction with supplied args */
+ .type x86_cpuid,@function
+x86_cpuid:
+ push ebp
+ mov ebp,esp
+ push edi /* edi is used below as the scratch destination pointer */
+ push ebx
+ push ecx
+ push edx
+ mov eax, [ebp+8] /* requested CPUID function number */
+ cpuid
+ /* write each result register through its own output pointer */
+ mov edi, [ebp+12]
+ mov [edi],eax
+ mov edi, [ebp+16]
+ mov [edi],ebx
+ mov edi, [ebp+20]
+ mov [edi],ecx
+ mov edi, [ebp+24]
+ mov [edi],edx
+ pop edx
+ pop ecx
+ pop ebx
+ pop edi
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+
+
+
+
+
+
+
+
+++ /dev/null
-;;
-;; This source code is part of
-;;
-;; G R O M A C S
-;;
-;; GROningen MAchine for Chemical Simulations
-;;
-;; VERSION 3.0
-;;
-;; Copyright (c) 1991-2001
-;; BIOSON Research Institute, Dept. of Biophysical Chemistry
-;; University of Groningen, The Netherlands
-;;
-;; This program is free software; you can redistribute it and/or
-;; modify it under the terms of the GNU General Public License
-;; as published by the Free Software Foundation; either version 2
-;; of the License, or (at your option) any later version.
-;;
-;; If you want to redistribute modifications, please consider that
-;; scientific software is very special. Version control is crucial -
-;; bugs must be traceable. We will be happy to consider code for
-;; inclusion in the official distribution, but derived work must not
-;; be called official GROMACS. Details are found in the README & COPYING
-;; files - if they are missing, get the official version at www.gromacs.org.
-;;
-;; To help us fund GROMACS development, we humbly ask that you cite
-;; the papers on the package - you can find them in the top README file.
-;;
-;; Do check out http: //www.gromacs.org , or mail us at gromacs@gromacs.org .
-;;
-;; And Hey:
-;; GROup of MAchos and Cynical Suckers
-
-;; this file must be processed with a version
-;; of nasm that supports the extended 3dnow instructions.
-;; you can find a binary of such a version on the
-;; gromacs homepage.
-
-segment .text
-global x86_cpuid ; issues the cpuid instruction with supplied args
-x86_cpuid:
- push ebp
- mov ebp,esp
- push edi
- push ebx
- push ecx
- push edx
- mov eax, [ebp+8]
- cpuid
- mov edi, [ebp+12]
- mov [edi],eax
- mov edi, [ebp+16]
- mov [edi],ebx
- mov edi, [ebp+20]
- mov [edi],ecx
- mov edi, [ebp+24]
- mov [edi],edx
- pop edx
- pop ecx
- pop ebx
- pop edi
- mov esp, ebp
- pop ebp
- ret
--- /dev/null
+/*
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.0
+ *
+ * Copyright (c) 1991-2001
+ * Dept. of Biophysical Chemistry
+ * University of Groningen, The Netherlands
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */
+.intel_syntax noprefix
+
+.text
+.align 16
+
+/*
+ * Constant tables: each label holds four identical 32-bit words, i.e. one
+ * 128-bit vector with the same IEEE-754 single-precision value in every lane.
+ */
+sse_minushalf: /* 0xbf000000 = -0.5 */
+ .long 0xbf000000
+ .long 0xbf000000
+ .long 0xbf000000
+ .long 0xbf000000
+sse_half: /* 0x3f000000 = 0.5 */
+ .long 0x3f000000
+ .long 0x3f000000
+ .long 0x3f000000
+ .long 0x3f000000
+sse_two: /* 0x40000000 = 2.0 */
+ .long 0x40000000
+ .long 0x40000000
+ .long 0x40000000
+ .long 0x40000000
+sse_three: /* 0x40400000 = 3.0 */
+ .long 0x40400000
+ .long 0x40400000
+ .long 0x40400000
+ .long 0x40400000
+sse_six: /* 0x40c00000 = 6.0 */
+ .long 0x40c00000
+ .long 0x40c00000
+ .long 0x40c00000
+ .long 0x40c00000
+sse_twelve: /* 0x41400000 = 12.0 */
+ .long 0x41400000
+ .long 0x41400000
+ .long 0x41400000
+ .long 0x41400000
+
+
+
+/*
+ * checksse: execute a single SSE instruction (xorps xmm0,xmm0) bracketed
+ * by emms, then return. Takes no arguments and writes no memory; xmm0 is
+ * left zeroed.
+ * NOTE(review): presumably the caller traps the illegal-instruction fault
+ * to detect missing SSE support - confirm against the call site.
+ */
+.globl checksse /* try to issue a SSE instruction */
+ .type checksse,@function
+checksse:
+ emms
+ xorps xmm0,xmm0
+ emms
+ ret
+
+.align 16
+
+/*
+ * vecinvsqrt_sse: element-wise 1/sqrt(x) over an array of single-precision
+ * floats. Each result is the rsqrtps/rsqrtss estimate y refined by one
+ * Newton-Raphson step,
+ *       y' = 0.5 * y * (3 - x*y*y),
+ * evaluated here as (x*y*y - 3) * y * (-0.5).
+ *
+ * Stack arguments:
+ *   [ebp+8]   pointer to the input values
+ *   [ebp+12]  pointer to the output values
+ *   [ebp+16]  number of elements
+ *
+ * Elements are handled 8 at a time, then a tail of 4, 2 and 1 selected by
+ * the low bits of the count. Groups of four use movaps, so both pointers
+ * must be 16-byte aligned whenever four or more elements are processed.
+ * eax, ebx, ecx and edx are preserved; xmm0-xmm3, xmm6 and xmm7 are
+ * overwritten.
+ */
+.globl vecinvsqrt_sse
+ .type vecinvsqrt_sse,@function
+vecinvsqrt_sse:
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+
+ mov eax, [ebp + 8] /* input pointer */
+ mov ebx, [ebp + 12] /* output pointer */
+ mov ecx, [ebp + 16] /* element count */
+ mov edx, ecx /* keep the full count for the tail tests */
+ movups xmm6,[sse_three]
+ /* The multiplier has to be -0.5: subps below leaves x*y*y - 3, so */
+ /* scaling by +0.5 would return -1/sqrt(x) instead of 1/sqrt(x). */
+ movups xmm7,[sse_minushalf]
+ shr ecx, 3 /* number of 8-element iterations */
+ jecxz .vecinvsqrt_iter4
+ emms
+.vecinvsqrt_loop8:
+ /* two independent 4-wide streams, interleaved */
+ movaps xmm0,[eax]
+ add eax, 16
+ rsqrtps xmm1,xmm0 /* y = lookup estimate */
+ movaps xmm2,[eax]
+ add eax, 16
+ rsqrtps xmm3,xmm2
+ mulps xmm0,xmm1
+ mulps xmm2,xmm3
+ mulps xmm0,xmm1 /* x*y*y */
+ mulps xmm2,xmm3
+ subps xmm0,xmm6 /* x*y*y - 3 */
+ subps xmm2,xmm6
+ mulps xmm0,xmm1
+ mulps xmm2,xmm3
+ mulps xmm0,xmm7 /* * (-0.5) -> refined 1/sqrt(x) */
+ mulps xmm2,xmm7
+ movaps [ebx],xmm0
+ add ebx, 16
+ movaps [ebx],xmm2
+ add ebx, 16
+ dec ecx
+ jecxz .vecinvsqrt_iter4
+ jmp .vecinvsqrt_loop8
+.vecinvsqrt_iter4:
+ /* one group of four if bit 2 of the count is set */
+ mov ecx,edx
+ and ecx,4
+ jecxz .vecinvsqrt_iter2
+ movaps xmm0,[eax]
+ add eax, 16
+ rsqrtps xmm1,xmm0
+ mulps xmm0,xmm1
+ mulps xmm0,xmm1
+ subps xmm0,xmm6
+ mulps xmm0,xmm1
+ mulps xmm0,xmm7
+ movaps [ebx],xmm0
+ add ebx, 16
+.vecinvsqrt_iter2:
+ /* one pair if bit 1 is set; only the low two lanes are loaded and */
+ /* stored, the upper lanes hold stale data that is never written out */
+ mov ecx,edx
+ and ecx,2
+ jecxz .vecinvsqrt_iter1
+ movlps xmm0,[eax]
+ add eax, 8
+ rsqrtps xmm1,xmm0
+ mulps xmm0,xmm1
+ mulps xmm0,xmm1
+ subps xmm0,xmm6
+ mulps xmm0,xmm1
+ mulps xmm0,xmm7
+ movlps [ebx],xmm0
+ add ebx, 8
+.vecinvsqrt_iter1:
+ /* final single element if bit 0 is set */
+ mov ecx,edx
+ and ecx,1
+ jecxz .vecinvsqrt_end
+ movss xmm0,[eax]
+ rsqrtss xmm1,xmm0
+ mulss xmm0,xmm1
+ mulss xmm0,xmm1
+ subss xmm0,xmm6
+ mulss xmm0,xmm1
+ mulss xmm0,xmm7
+ movss [ebx],xmm0
+.vecinvsqrt_end:
+ emms
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+/*
+ * vecrecip_sse: element-wise 1/x over an array of single-precision floats.
+ * Each result is the rcpps/rcpss estimate y refined by one Newton-Raphson
+ * step, y' = y * (2 - x*y).
+ *
+ * Stack arguments:
+ *   [ebp+8]   pointer to the input values
+ *   [ebp+12]  pointer to the output values
+ *   [ebp+16]  number of elements
+ *
+ * Elements are handled 8 at a time, then a tail of 4, 2 and 1 selected by
+ * the low bits of the count. Groups of four use movaps, so both pointers
+ * must be 16-byte aligned whenever four or more elements are processed.
+ * eax, ebx, ecx and edx are preserved; xmm0-xmm6 are overwritten.
+ */
+.globl vecrecip_sse
+ .type vecrecip_sse,@function
+vecrecip_sse:
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+
+ mov eax, [ebp + 8] /* input pointer */
+ mov ebx, [ebp + 12] /* output pointer */
+ mov ecx, [ebp + 16] /* element count */
+ mov edx, ecx /* keep the full count for the tail tests */
+ movups xmm6,[sse_two]
+ shr ecx, 3 /* number of 8-element iterations */
+ jecxz .vecrecip_iter4
+ emms
+.vecrecip_loop8:
+ /* two independent 4-wide streams, interleaved */
+ movaps xmm0,[eax]
+ add eax, 16
+ rcpps xmm1,xmm0 /* y = lookup estimate, first four */
+ movaps xmm3,[eax]
+ add eax, 16
+ rcpps xmm4,xmm3 /* y = lookup estimate, second four */
+ movaps xmm2,xmm6
+ mulps xmm0,xmm1 /* x*y */
+ movaps xmm5,xmm6
+ subps xmm2,xmm0 /* 2 - x*y */
+ mulps xmm3,xmm4
+ mulps xmm2,xmm1 /* y*(2 - x*y) */
+ subps xmm5,xmm3
+ movaps [ebx],xmm2
+ mulps xmm5,xmm4
+ add ebx, 16
+ movaps [ebx],xmm5
+ add ebx, 16
+ dec ecx
+ jecxz .vecrecip_iter4
+ jmp .vecrecip_loop8
+.vecrecip_iter4:
+ /* one group of four if bit 2 of the count is set */
+ mov ecx,edx
+ and ecx,4
+ jecxz .vecrecip_iter2
+ movaps xmm0,[eax]
+ add eax, 16
+ rcpps xmm1,xmm0
+ movaps xmm2,xmm6
+ mulps xmm0,xmm1
+ subps xmm2,xmm0
+ mulps xmm2,xmm1
+ movaps [ebx],xmm2
+ add ebx, 16
+.vecrecip_iter2:
+ /* one pair if bit 1 is set; only the low two lanes are loaded and stored */
+ mov ecx,edx
+ and ecx,2
+ jecxz .vecrecip_iter1
+ movlps xmm0,[eax]
+ add eax, 8
+ rcpps xmm1,xmm0
+ movaps xmm2,xmm6
+ mulps xmm0,xmm1
+ subps xmm2,xmm0
+ mulps xmm2,xmm1
+ movlps [ebx],xmm2
+ add ebx, 8
+.vecrecip_iter1:
+ /* final single element if bit 0 is set */
+ mov ecx,edx
+ and ecx,1
+ jecxz .vecrecip_end
+ movss xmm0,[eax]
+ rcpss xmm1,xmm0
+ movss xmm2,xmm6
+ mulss xmm0,xmm1
+ subss xmm2,xmm0
+ mulss xmm2,xmm1
+ movss [ebx],xmm2
+.vecrecip_end:
+ emms
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl0100_sse
+ .type inl0100_sse,@function
+inl0100_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ dx, 48
+.equ dy, 64
+.equ dz, 80
+.equ two, 96
+.equ c6, 112
+.equ c12, 128
+.equ six, 144
+.equ twelve, 160
+.equ vnbtot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240
+.equ three, 256
+.equ is3, 272
+.equ ii3, 276
+.equ ntia, 280
+.equ innerjjnr, 284
+.equ innerk, 288
+.equ salign, 292
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 296 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movaps [esp + two], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i0100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0100_unroll_loop
+ jmp .i0100_finish_inner
+.i0100_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm2
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm5 contains fjx, fjy, fjz */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i0100_finish_inner
+ jmp .i0100_unroll_loop
+.i0100_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i0100_dopair
+ jmp .i0100_checksingle
+.i0100_dopair:
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0100_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i0100_dosingle
+ jmp .i0100_updateouterdata
+.i0100_dosingle:
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0100_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0100_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0100_outer
+.i0100_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 296
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+/*
+ * inl0110_sse
+ *
+ * Lennard-Jones-only nonbonded inner loop in SSE single precision for
+ * outer-list entries that cover several consecutive i atoms.
+ *
+ * All arguments are 32-bit values read from the stack relative to ebp
+ * (offsets are given by the .equ list below):
+ * nri number of outer-list entries; at least one is assumed,
+ * the value is not checked before the first iteration
+ * iinr per entry: index of the first i atom
+ * jindex per entry n: jindex[n]..jindex[n+1] delimits its part of jjnr
+ * jjnr j atom indices
+ * shift per entry: index into shiftvec/fshift
+ * shiftvec shift vectors, three floats each
+ * fshift shift-force accumulators, three floats each
+ * gid per entry: index into Vnb
+ * pos coordinates, three floats per atom
+ * faction forces, three floats per atom (updated in place)
+ * type per atom: type index
+ * ntype multiplier for the i atom type when indexing nbfp
+ * nbfp (c6,c12) pairs at index 2*(ntype*type[i] + type[j])
+ * Vnb per-group LJ energy accumulators (updated in place)
+ * nsatoms three ints per entry, n0,n1,n2: the first n1 atoms are
+ * processed, the next n2-n1 are skipped, and the following
+ * n0-n2 are processed
+ *
+ * The argument slots on the stack are used as working variables: the
+ * list pointers are advanced and nri is counted down in place.
+ * eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
+ * exit, so nothing is returned in a register. mm0-mm3 are used as
+ * scratch; emms is issued on entry and again before returning.
+ */
+.globl inl0110_sse
+ .type inl0110_sse,@function
+inl0110_sse:
+ /* argument offsets relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ nsatoms, 64
+ /* stack offsets for local variables */
+ /* esp is rounded down to a 16-byte boundary below, so the */
+ /* 16-byte slots can be accessed with movaps */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ dx, 48
+.equ dy, 64
+.equ dz, 80
+.equ two, 96
+.equ c6, 112
+.equ c12, 128
+.equ six, 144
+.equ twelve, 160
+.equ vnbtot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240 /* not referenced in this routine */
+.equ three, 256 /* not referenced in this routine */
+.equ is3, 272
+.equ ii3, 276
+.equ shX, 280
+.equ shY, 284
+.equ shZ, 288
+.equ ntia, 292
+.equ innerjjnr0, 296
+.equ innerjjnr, 300
+.equ innerk0, 304
+.equ innerk, 308
+.equ salign, 312
+.equ nsvdwc, 316
+.equ nscoul, 320
+.equ nsvdw, 324
+.equ solnr, 328
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 332 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax /* round esp down to a multiple of 16 */
+ mov [esp + salign], eax /* remember the adjustment for the epilogue */
+
+ emms
+
+ /* copy the constants 2, 6 and 12 to aligned stack slots */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movaps [esp + two], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i0110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movlps xmm0, [eax + ebx*4] /* getting the shiftvector */
+ movss xmm1, [eax + ebx*4 + 8]
+ movlps [esp + shX], xmm0 /* writes shX and shY */
+ movss [esp + shZ], xmm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* split the three counters of this entry into loop counts */
+ mov eax, [ebp + nsatoms]
+ add dword ptr [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax /* ecx = n0 - n2 */
+ sub eax, edx /* eax = n2 - n1 */
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear vnbtot */
+ xorps xmm4, xmm4
+ movaps [esp + vnbtot], xmm4
+ mov [esp + solnr], ebx /* current i atom index */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ /* first group of i atoms: nsvdwc of them, skipped if zero */
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i0110_mno_vdwc
+ jmp .i0110_testvdw
+.i0110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ /* ntia = 2*ntype*type[ii], row offset into nbfp */
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ /* i coordinate = shift vector + pos[ii] */
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ /* broadcast the i coordinate to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ /* restart the j list for this i atom */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ /* mov leaves the flags from the sub intact */
+ jge .i0110_unroll_vdwc_loop
+ jmp .i0110_finish_vdwc_inner
+.i0110_unroll_vdwc_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ /* nbfp index for each j: 2*type[j] + ntia */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* load the four (c6,c12) pairs and transpose them */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = c6 for j1-j4 */
+ shufps xmm6, xmm7, 0b11011101 /* xmm6 = c12 for j1-j4 */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z for j1-j4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x for j1-j4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y for j1-j4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ /* one Newton-Raphson step: lu*(2 - rsq*lu) */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1 /* 12*vnb12 - 6*vnb6 */
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* tz for j2 to lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* tz for j3 to lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* tz for j4 to lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl .i0110_finish_vdwc_inner
+ jmp .i0110_unroll_vdwc_loop
+.i0110_finish_vdwc_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* undo the bias: innerk = 0..3 left */
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i0110_dopair_vdwc
+ jmp .i0110_checksingle_vdwc
+.i0110_dopair_vdwc:
+ /* two j atoms in lanes 0-1; c6/c12 are zeroed in lanes 2-3 */
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* c6 of j1,j2 in lanes 0-1 */
+ shufps xmm6, xmm6, 0b1101 /* c12 of j1,j2 in lanes 0-1 */
+ movlhps xmm4, xmm7 /* zero lanes 2-3 */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 to reach the second j atom */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0110_checksingle_vdwc:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i0110_dosingle_vdwc
+ jmp .i0110_updateouterdata_vdwc
+.i0110_dosingle_vdwc:
+ /* one j atom in lane 0; c6/c12 are zero in lanes 1-3 */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* c6 in lane 0, zero elsewhere */
+ shufps xmm6, xmm6, 0b11111101 /* c12 in lane 0, zero elsewhere */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0110_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* horizontal add: fold the high half onto the low half */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz .i0110_testvdw
+ jmp .i0110_mno_vdwc
+.i0110_testvdw:
+ /* step over the nscoul atoms; they are not processed here */
+ mov ebx, [esp + nscoul]
+ add [esp + solnr], ebx
+
+ /* second group of i atoms: nsvdw of them, skipped if zero */
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i0110_mno_vdw
+ jmp .i0110_last_mno
+.i0110_mno_vdw:
+ /* from here to .i0110_last_mno the code repeats the vdwc */
+ /* sequence above with its own set of labels */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ /* ntia = 2*ntype*type[ii], row offset into nbfp */
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ /* i coordinate = shift vector + pos[ii] */
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ /* broadcast the i coordinate to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ /* restart the j list for this i atom */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ /* mov leaves the flags from the sub intact */
+ jge .i0110_unroll_vdw_loop
+ jmp .i0110_finish_vdw_inner
+.i0110_unroll_vdw_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ /* nbfp index for each j: 2*type[j] + ntia */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* load the four (c6,c12) pairs and transpose them */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = c6 for j1-j4 */
+ shufps xmm6, xmm7, 0b11011101 /* xmm6 = c12 for j1-j4 */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z for j1-j4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x for j1-j4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y for j1-j4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ /* one Newton-Raphson step: lu*(2 - rsq*lu) */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1 /* 12*vnb12 - 6*vnb6 */
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* tz for j2 to lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* tz for j3 to lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* tz for j4 to lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl .i0110_finish_vdw_inner
+ jmp .i0110_unroll_vdw_loop
+.i0110_finish_vdw_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* undo the bias: innerk = 0..3 left */
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i0110_dopair_vdw
+ jmp .i0110_checksingle_vdw
+.i0110_dopair_vdw:
+ /* two j atoms in lanes 0-1; c6/c12 are zeroed in lanes 2-3 */
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* c6 of j1,j2 in lanes 0-1 */
+ shufps xmm6, xmm6, 0b1101 /* c12 of j1,j2 in lanes 0-1 */
+ movlhps xmm4, xmm7 /* zero lanes 2-3 */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 to reach the second j atom */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0110_checksingle_vdw:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i0110_dosingle_vdw
+ jmp .i0110_updateouterdata_vdw
+.i0110_dosingle_vdw:
+ /* one j atom in lane 0; c6/c12 are zero in lanes 1-3 */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* c6 in lane 0, zero elsewhere */
+ shufps xmm6, xmm6, 0b11111101 /* c12 in lane 0, zero elsewhere */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0110_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* horizontal add: fold the high half onto the low half */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz .i0110_last_mno
+ jmp .i0110_mno_vdw
+
+.i0110_last_mno:
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0110_outer
+.i0110_end:
+ emms
+ /* undo the alignment adjustment, then the fixed local area */
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 332
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+/*
+ * inl0300_sse - inner loop for tabulated dispersion/repulsion (LJ)
+ * interactions, no coulomb, single precision SSE.
+ *
+ * Arguments are read from the caller's stack relative to ebp at the
+ * offsets defined by the first group of .equ lines below. The pointer
+ * arguments shift, iinr, jindex and gid as well as the counter nri are
+ * advanced/decremented IN PLACE in the argument area while iterating
+ * over the i particles.
+ *
+ * For each i particle the j particles jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]
+ * are processed four at a time, then a remaining pair, then a remaining
+ * single. For every pair the routine
+ *   - fetches c6/c12 from nbfp[2*(ntype*type[i] + type[j])],
+ *   - computes r*tabscale with rsqrtps plus one Newton-Raphson step,
+ *   - looks up 8 floats per table point in VFtab (4 for dispersion at
+ *     byte offset 0, 4 for repulsion at byte offset 16),
+ *   - subtracts the force from faction[j] and accumulates it for i.
+ * After the j loop the i force is added to faction[ii] and fshift[is],
+ * and the summed energy is added to Vnb[gid[n]].
+ *
+ * The loop body is entered without testing nri, so the caller must pass
+ * at least one i particle. MMX registers are used as scratch storage;
+ * emms is issued on entry and exit. All general purpose registers that
+ * are used are saved and restored.
+ */
+.globl inl0300_sse
+ .type inl0300_sse,@function
+inl0300_sse:
+ /* offsets of the arguments relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ tabscale, 64
+.equ VFtab, 68
+ /* stack offsets for local variables */
+ /* esp is aligned to 16 bytes below, so the 16-byte slots can be */
+ /* accessed with movaps */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ dx, 48
+.equ dy, 64
+.equ dz, 80
+.equ two, 96
+.equ tsc, 112
+.equ c6, 128
+.equ c12, 144
+.equ fscal, 160
+.equ vnbtot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240
+.equ three, 256
+.equ is3, 272
+.equ ii3, 276
+.equ ntia, 280
+.equ innerjjnr, 284
+.equ innerk, 288
+.equ salign, 292
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 296 /* local stack space */
+ /* align esp downwards to 16 bytes and remember the adjustment */
+ /* in salign so that it can be undone before returning */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the constants and the broadcast table scale to aligned stack slots */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i0300_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* ntia = 2*ntype*type[ii], row offset into nbfp[] for this i particle */
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ /* shifted i coordinate = shiftvec[is] + pos[ii] */
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ /* broadcast to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear tot potential and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4 */
+ /* the flags tested here are still those of the sub above */
+ jge .i0300_unroll_loop
+ jmp .i0300_finish_inner
+.i0300_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ /* index into nbfp[] = 2*type[jnr] + ntia for each of the four j particles */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* load the four (c6,c12) pairs and transpose to c6 in xmm4, c12 in xmm6 */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ /* restore jnr1-4 */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ /* rinv = 0.5*lu*(3.0-rsq*lu*lu), one Newton-Raphson step on the seed lu */
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+ /* split r*tabscale into truncated integer table index and fraction eps */
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 3 /* index*8: eight floats per table point */
+ pslld mm7, 3
+
+ /* save j3 offsets in mmx registers while eax-edx hold table offsets */
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 0]
+ movlps xmm7, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + ebx*4 + 0]
+ movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ /* xmm4 = -(fijD+fijR)*tabscale*rinv, the factor multiplying dr */
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ /* restore the j3 offsets */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx/fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ /* rotate tz for j2, j3 and j4 into lane 0 in turn */
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl .i0300_finish_inner
+ jmp .i0300_unroll_loop
+.i0300_finish_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i0300_dopair
+ jmp .i0300_checksingle
+.i0300_dopair:
+ /* two j particles: only the two low lanes carry data, c6/c12 are */
+ /* zeroed in the two high lanes so those lanes contribute nothing */
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7 /* zero the high half of c6 */
+ movlhps xmm6, xmm7 /* zero the high half of c12 */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ /* dispersion */
+ movlps xmm5, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ /* shuffle each register with itself, exactly as for dispersion, so */
+ /* the two unused high lanes only ever hold copies of table data */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 to get the force on the second j particle */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0300_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i0300_dosingle
+ jmp .i0300_updateouterdata
+.i0300_dosingle:
+ /* one j particle: c6/c12 are nonzero in lane 0 only */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ /* dispersion */
+ movlps xmm4, [esi + ebx*4 + 0]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0300_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0300_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0300_outer
+.i0300_end:
+ emms
+ /* undo the alignment adjustment and release the local stack space */
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 296
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+.globl inl0310_sse
+ .type inl0310_sse,@function
+inl0310_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ type, 48
+.equ ntype, 52
+.equ nbfp, 56
+.equ Vnb, 60
+.equ tabscale, 64
+.equ VFtab, 68
+.equ nsatoms, 72
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ dx, 48
+.equ dy, 64
+.equ dz, 80
+.equ two, 96
+.equ tsc, 112
+.equ c6, 128
+.equ c12, 144
+.equ fscal, 160
+.equ vnbtot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240
+.equ three, 256
+.equ is3, 272
+.equ ii3, 276
+.equ shX, 280
+.equ shY, 284
+.equ shZ, 288
+.equ ntia, 292
+.equ innerjjnr0, 296
+.equ innerjjnr, 300
+.equ innerk0, 304
+.equ innerk, 308
+.equ salign, 312
+.equ nsvdwc, 316
+.equ nscoul, 320
+.equ nsvdw, 324
+.equ solnr, 328
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 332 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i0310_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movlps xmm0, [eax + ebx*4] /* getting the shiftvector */
+ movss xmm1, [eax + ebx*4 + 8]
+ movlps [esp + shX], xmm0
+ movss [esp + shZ], xmm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear vnbtot */
+ xorps xmm4, xmm4
+ movaps [esp + vnbtot], xmm4
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i0310_mno_vdwc
+ jmp .i0310_testvdw
+.i0310_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0310_unroll_vdwc_loop
+ jmp .i0310_finish_vdwc_inner
+.i0310_unroll_vdwc_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 3
+ pslld mm7, 3
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 0]
+ movlps xmm7, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + ebx*4 + 0]
+ movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm5 contains fjx, fjy, fjz */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i0310_finish_vdwc_inner
+ jmp .i0310_unroll_vdwc_loop
+.i0310_finish_vdwc_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i0310_dopair_vdwc
+ jmp .i0310_checksingle_vdwc
+.i0310_dopair_vdwc: /* pair remainder: two j particles handled in lanes 0-1 */
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax,ebx = the two j indices */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer by two entries; explicit dword operand size */
+ xorps xmm7, xmm7
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi /* ecx,edx = 2*type[j] + ntia, index of the parameter pair in nbfp */
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4] /* two consecutive nbfp floats per j particle */
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* first float of each pair to lanes 0-1 */
+ shufps xmm6, xmm6, 0b1101 /* second float of each pair to lanes 0-1 */
+ movlhps xmm4, xmm7 /* zero lanes 2-3 (xmm7 is zero) */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2] /* eax,ebx = 3*jnr */
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ shufps xmm2, xmm0, 0 /* z1 z1 z2 z2 */
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* z1 z2 z1 z2 */
+
+ shufps xmm0, xmm0, 0b10001000 /* x1 x2 x1 x2 */
+ shufps xmm1, xmm1, 0b11011101 /* y1 y2 y1 y2 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3 /* index*8: with the *4 scaling below, 32 bytes per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ /* dispersion */
+ movlps xmm5, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv, the factor applied to dr */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap lanes 0 and 1: second particle's force to lane 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0310_checksingle_vdwc: /* is there one last odd j particle? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 of the remaining count */
+ jnz .i0310_dosingle_vdwc
+ jmp .i0310_updateouterdata_vdwc
+.i0310_dosingle_vdwc: /* single remainder: one j particle handled in lane 0 */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx] /* eax = j index */
+ xorps xmm6, xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia] /* ecx = 2*type[j] + ntia */
+ movlps xmm6, [esi + ecx*4] /* two nbfp floats in lanes 0-1, lanes 2-3 stay zero */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* lane 0 = first float, lanes 1-3 = 0 */
+ shufps xmm6, xmm6, 0b11111101 /* lane 0 = second float, lanes 1-3 = 0 */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3 /* index*8: with the *4 scaling below, 32 bytes per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6 /* only the lane-0 index is used */
+
+ /* dispersion */
+ movlps xmm4, [esi + ebx*4 + 0]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second float of each loaded pair to lane 0 */
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv, the factor applied to dr */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0310_updateouterdata_vdwc: /* reduce the i-force accumulators and add them to faction[] and fshift[] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* upper two lanes down to lanes 0-1 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* lane 1 to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc] /* one fewer atom left in this pass */
+ jz .i0310_testvdw
+ jmp .i0310_mno_vdwc
+.i0310_testvdw: /* decide whether the vdw pass has any atoms to process */
+ mov ebx, [esp + nscoul]
+ add [esp + solnr], ebx /* advance solnr by nscoul */
+
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i0310_mno_vdw /* nsvdw != 0: run the vdw pass */
+ jmp .i0310_last_mno
+.i0310_mno_vdw: /* set up one i atom for the vdw pass */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr] /* ebx keeps the current index; the stored counter moves on */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4] /* shifted i coordinates */
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4 /* clear the i-force accumulators */
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ shufps xmm0, xmm0, 0 /* broadcast lane 0 to all four lanes */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j list from its saved start */
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i0310_unroll_vdw_loop /* at least four j atoms: take the unrolled loop */
+ jmp .i0310_finish_vdw_inner
+.i0310_unroll_vdw_loop: /* four j particles per iteration, one per SSE lane */
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4); explicit dword operand size */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi /* eax-edx = 2*type[j] + ntia, index of the parameter pair in nbfp */
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* first float of each pair, j1-j4 */
+ shufps xmm6, xmm7, 0b11011101 /* second float of each pair, j1-j4 */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0 /* z1 z1 z2 z2 */
+ shufps xmm6, xmm1, 0 /* z3 z3 z4 z4 */
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* z1 z2 z3 z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* x1 x2 x3 x4 */
+ shufps xmm1, xmm5, 0b11011101 /* y1 y2 y3 y4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 3 /* index*8: with the *4 scaling below, 32 bytes per table point */
+ pslld mm7, 3
+
+ movd mm0, eax /* park j3 offsets in mmx while eax-edx hold table indices */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 0]
+ movlps xmm7, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + ebx*4 + 0]
+ movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv, the factor applied to dr */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0 /* restore j3 offsets */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm4 contains fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* lane 1 (j2) to lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* lane 2 (j3) to lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* lane 3 (j4) to lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4 /* explicit dword operand size; flags drive the branch below */
+ jl .i0310_finish_vdw_inner
+ jmp .i0310_unroll_vdw_loop
+.i0310_finish_vdw_inner: /* remainder handling after the 4-way unrolled loop */
+ /* restore the count (4 was subtracted before getting here) and check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* explicit dword: a memory/immediate form has no implied operand size */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set -> a pair is left */
+ jnz .i0310_dopair_vdw
+ jmp .i0310_checksingle_vdw
+.i0310_dopair_vdw: /* pair remainder: two j particles handled in lanes 0-1 */
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4] /* eax,ebx = the two j indices */
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer by two entries; explicit dword operand size */
+ xorps xmm7, xmm7
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi /* ecx,edx = 2*type[j] + ntia, index of the parameter pair in nbfp */
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4] /* two consecutive nbfp floats per j particle */
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* first float of each pair to lanes 0-1 */
+ shufps xmm6, xmm6, 0b1101 /* second float of each pair to lanes 0-1 */
+ movlhps xmm4, xmm7 /* zero lanes 2-3 (xmm7 is zero) */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2] /* eax,ebx = 3*jnr */
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ shufps xmm2, xmm0, 0 /* z1 z1 z2 z2 */
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* z1 z2 z1 z2 */
+
+ shufps xmm0, xmm0, 0b10001000 /* x1 x2 x1 x2 */
+ shufps xmm1, xmm1, 0b11011101 /* y1 y2 y1 y2 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3 /* index*8: with the *4 scaling below, 32 bytes per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ /* dispersion */
+ movlps xmm5, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv, the factor applied to dr */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap lanes 0 and 1: second particle's force to lane 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i0310_checksingle_vdw: /* is there one last odd j particle? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 of the remaining count */
+ jnz .i0310_dosingle_vdw
+ jmp .i0310_updateouterdata_vdw
+.i0310_dosingle_vdw: /* single remainder: one j particle handled in lane 0 */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx] /* eax = j index */
+ xorps xmm6, xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia] /* ecx = 2*type[j] + ntia */
+ movlps xmm6, [esi + ecx*4] /* two nbfp floats in lanes 0-1, lanes 2-3 stay zero */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* lane 0 = first float, lanes 1-3 = 0 */
+ shufps xmm6, xmm6, 0b11111101 /* lane 0 = second float, lanes 1-3 = 0 */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 3 /* index*8: with the *4 scaling below, 32 bytes per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6 /* only the lane-0 index is used */
+
+ /* dispersion */
+ movlps xmm4, [esi + ebx*4 + 0]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second float of each loaded pair to lane 0 */
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv, the factor applied to dr */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i0310_updateouterdata_vdw: /* reduce the i-force accumulators and add them to faction[] and fshift[] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* upper two lanes down to lanes 0-1 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* lane 1 to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw] /* one fewer atom left in this pass */
+ jz .i0310_last_mno
+ jmp .i0310_mno_vdw
+.i0310_last_mno: /* per-i-particle wrap-up: store the energy sum, then loop or exit */
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer; explicit dword operand size */
+
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1 /* lane 1 to lane 0 */
+ addss xmm7, xmm6 /* lane 0 = sum of all four lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i0310_end /* ecx == 0: no i particles left */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i0310_outer
+.i0310_end: /* function epilogue */
+ emms /* leave MMX state before returning */
+ mov eax, [esp + salign]
+ add esp, eax /* presumably undoes the alignment padding saved by the prologue - confirm there */
+ add esp, 332 /* release local stack space; must match the prologue's allocation */
+ pop edi /* restore registers in reverse order */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl1000_sse
+ .type inl1000_sse,@function
+inl1000_sse: /* argument offsets below are relative to ebp after the standard frame setup */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* stack offsets for local variables */
+ /* bottom of stack is aligned to 16 bytes (see the esp adjustment below) for movaps use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ vctot, 112
+.equ fix, 128
+.equ fiy, 144
+.equ fiz, 160
+.equ half, 176
+.equ three, 192
+.equ is3, 208
+.equ ii3, 212
+.equ innerjjnr, 216
+.equ innerk, 220
+.equ salign, 224
+ push ebp
+ mov ebp,esp
+ push eax /* save all general registers used below */
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 228 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax /* round esp down to a multiple of 16 */
+ mov [esp + salign], eax /* remember the padding that was added */
+
+ emms
+
+ movups xmm0, [sse_half] /* unaligned loads of the constants ... */
+ movups xmm1, [sse_three]
+ movaps [esp + half], xmm0 /* ... copied to aligned stack slots */
+ movaps [esp + three], xmm1
+
+ /* assume we have at least one i particle - start directly */
+i1000_outer: /* per-i-particle setup: shift vector, charge, coordinates, j-list range */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step; explicit dword operand size */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer; explicit dword operand size */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel] /* xmm3 = charge[ii]*facel */
+ shufps xmm3, xmm3, 0 /* broadcast to all four lanes */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4] /* shifted i coordinates */
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4 /* advance pointer; explicit dword operand size */
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge i1000_unroll_loop /* at least four j atoms: take the unrolled loop */
+ jmp i1000_finish_inner
+i1000_unroll_loop:
+ /* quad-unrolled innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm5, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm5
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4] /* x1 y1 - - */
+ movlps xmm5, [esi + ecx*4] /* x3 y3 - - */
+ movss xmm2, [esi + eax*4 + 8] /* z1 - - - */
+ movss xmm6, [esi + ecx*4 + 8] /* z3 - - - */
+
+ movhps xmm4, [esi + ebx*4] /* x1 y1 x2 y2 */
+ movhps xmm5, [esi + edx*4] /* x3 y3 x4 y4 */
+
+ movss xmm0, [esi + ebx*4 + 8] /* z2 - - - */
+ movss xmm1, [esi + edx*4 + 8] /* z4 - - - */
+
+ shufps xmm2, xmm0, 0 /* z1 z1 z2 z2 */
+ shufps xmm6, xmm1, 0 /* z3 z3 z4 z4 */
+
+ movaps xmm0, xmm4 /* x1 y1 x2 y2 */
+ movaps xmm1, xmm4 /* x1 y1 x2 y2 */
+
+ shufps xmm2, xmm6, 0b10001000 /* z1 z2 z3 z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* x1 x2 x3 x4 */
+ shufps xmm1, xmm5, 0b11011101 /* y1 y2 y3 y4 */
+
+ mov edi, [ebp + faction]
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm5 contains fjx, fjy, fjz */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1000_finish_inner
+ jmp i1000_unroll_loop
+i1000_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz i1000_dopair
+ jmp i1000_checksingle
+i1000_dopair:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+ xorps xmm7,xmm7
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+i1000_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz i1000_dosingle
+ jmp i1000_updateouterdata
+i1000_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ mov edi, [ebp + faction]
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+i1000_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1000_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1000_outer
+i1000_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 228
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl1010_sse
+ .type inl1010_sse,@function
+inl1010_sse:
+ /* inl1010_sse - Coulomb-only inner loop using packed single-precision SSE.
+  *
+  * For each of the nri outer entries the routine reads a count (nscoul)
+  * from the nsatoms array and then treats that many consecutive i atoms,
+  * starting at iinr[n], one after the other.  Every such i atom is
+  * interacted with the complete j list jjnr[jindex[n] .. jindex[n+1]-1].
+  *
+  * Per i-j pair:   qq    = facel * charge[i] * charge[j]
+  *                 rinv  = 1/sqrt(dr.dr)   (rsqrtps seed + 1 Newton step)
+  *                 vcoul = qq * rinv
+  *                 fscal = vcoul * rinv * rinv
+  *                 f[i] += fscal*dr,  f[j] -= fscal*dr
+  * The i force is also added to fshift[shift[n]], and the summed vcoul
+  * of one outer entry is added to Vc[gid[n]].
+  *
+  * The j list is consumed four atoms at a time, followed by an optional
+  * pair and an optional single atom for the remainder.
+  *
+  * All arguments are taken from the stack relative to ebp.  NOTE: the
+  * argument slots nri, iinr, jindex, shift, gid and nsatoms are used as
+  * loop cursors/counters and are modified in place.
+  */
+ /* stack offsets (from ebp) of the arguments */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ nsatoms, 60
+ /* stack offsets (from esp) for local variables */
+ /* esp is rounded down to a 16-byte boundary in the prologue, so the
+  * 16-byte slots below (ix .. three) can be accessed with movaps */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ vctot, 112
+.equ fix, 128
+.equ fiy, 144
+.equ fiz, 160
+.equ half, 176
+.equ three, 192
+.equ is3, 208
+.equ ii3, 212
+.equ shX, 216
+.equ shY, 220
+.equ shZ, 224
+.equ ntia, 228
+.equ innerjjnr0, 232
+.equ innerk0, 236
+.equ innerjjnr, 240
+.equ innerk, 244
+.equ salign, 248
+.equ nscoul, 252
+.equ solnr, 256
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 260 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax /* align esp down to 16 bytes */
+ mov [esp + salign], eax /* remember the adjustment for the epilogue */
+
+ emms
+
+ /* copy the constants sse_half / sse_three to aligned local slots */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ /* point nsatoms at the third int of the first 3-int record; that */
+ /* entry is what is read as nscoul in the outer loop */
+ add dword ptr [ebp + nsatoms], 8
+
+ /* assume we have at least one i particle - start directly */
+i1010_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ /* keep the shift vector; it is re-added for every i atom below */
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+ movss [esp + shX], xmm0
+ movss [esp + shY], xmm1
+ movss [esp + shZ], xmm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ mov ecx, [eax] /* number of i atoms to loop over */
+ add dword ptr [ebp + nsatoms], 12 /* advance to the next 3-int record */
+ mov [esp + nscoul], ecx
+
+ /* clear vctot */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ mov [esp + solnr], ebx /* first i atom index */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ /* save start and length of the j list; both are reloaded for every */
+ /* i atom handled in this outer iteration */
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ /* skip the i atom loop entirely when the count is zero */
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz i1010_mno_coul
+ jmp i1010_last_mno
+i1010_mno_coul:
+ /* one pass per i atom: fetch its index and step solnr to the next */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0 /* iq = facel*charge[ii] in all four lanes */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ /* shifted i coordinates = shift vector + pos[ii3..ii3+2] */
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ /* broadcast ix, iy, iz to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ /* rewind the j list for this i atom */
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms, minus 4 */
+ jge i1010_unroll_coul_loop /* flags are still those of the sub above */
+ jmp i1010_finish_coul_inner
+
+i1010_unroll_coul_loop:
+ /* quad-unrolled innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax,ebx,ecx,edx = jnr1..jnr4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4] /* q1 */
+ movss xmm4, [esi + ecx*4] /* q3 */
+ movss xmm6, [esi + ebx*4] /* q2 */
+ movss xmm7, [esi + edx*4] /* q4 */
+
+ movaps xmm5, [esp + iq]
+ shufps xmm3, xmm6, 0 /* q1 q1 q2 q2 */
+ shufps xmm4, xmm7, 0 /* q3 q3 q4 q4 */
+ shufps xmm3, xmm4, 0b10001000 /* q1 q2 q3 q4 */
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm5 /* xmm3 = qq = iq*qj */
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4] /* x1 y1 - - */
+ movlps xmm5, [esi + ecx*4] /* x3 y3 - - */
+ movss xmm2, [esi + eax*4 + 8] /* z1 - - - */
+ movss xmm6, [esi + ecx*4 + 8] /* z3 - - - */
+
+ movhps xmm4, [esi + ebx*4] /* x1 y1 x2 y2 */
+ movhps xmm5, [esi + edx*4] /* x3 y3 x4 y4 */
+
+ movss xmm0, [esi + ebx*4 + 8] /* z2 - - - */
+ movss xmm1, [esi + edx*4 + 8] /* z4 - - - */
+
+ shufps xmm2, xmm0, 0 /* z1 z1 z2 z2 */
+ shufps xmm6, xmm1, 0 /* z3 z3 z4 z4 */
+
+ movaps xmm0, xmm4 /* x1 y1 x2 y2 */
+ movaps xmm1, xmm4 /* x1 y1 x2 y2 */
+
+ shufps xmm2, xmm6, 0b10001000 /* z1 z2 z3 z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* x1 x2 x3 x4 */
+ shufps xmm1, xmm5, 0b11011101 /* y1 y2 y3 y4 */
+
+ mov edi, [ebp + faction]
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ /* rinv = 0.5*lu*(3 - rsq*lu*lu), with lu the rsqrtps estimate */
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 = fjx and xmm4 = fjy for j1-j4 (fjz is handled below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* tz2 to lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* tz3 to lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* tz4 to lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl i1010_finish_coul_inner
+ jmp i1010_unroll_coul_loop
+i1010_finish_coul_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* undo the bias: innerk = 0..3 atoms left */
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz i1010_dopair_coul
+ jmp i1010_checksingle_coul
+i1010_dopair_coul:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8
+
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+ xorps xmm7,xmm7
+ movlhps xmm3, xmm7 /* zero qq in lanes 2,3 so they add nothing */
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's - only lane 0 is loaded and stored, so use */
+ /* scalar subtracts (as the single-atom path below does) */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 to bring the second atom's force to lane 0 */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+i1010_checksingle_coul:
+ /* one last j atom left if the remainder count is odd */
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz i1010_dosingle_coul
+ jmp i1010_updateouterdata_coul
+i1010_dosingle_coul:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq] /* lanes 1-3 of xmm3 are zero after movss */
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ mov edi, [ebp + faction]
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+i1010_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* horizontal sum, step 1: add the high half onto the low half */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums now in lanes 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ /* step 2: bring lane 1 to lane 0 and add */
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno while i atoms remain in this outer entry */
+ dec dword ptr [esp + nscoul]
+ jz i1010_last_mno
+ jmp i1010_mno_coul
+
+i1010_last_mno:
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1010_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1010_outer
+i1010_end:
+ emms
+ /* undo the alignment adjustment and release the 260 local bytes */
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 260
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl1020_sse
+ .type inl1020_sse,@function
+inl1020_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ vctot, 352
+.equ fixO, 368
+.equ fiyO, 384
+.equ fizO, 400
+.equ fixH1, 416
+.equ fiyH1, 432
+.equ fizH1, 448
+.equ fixH2, 464
+.equ fiyH2, 480
+.equ fizH2, 496
+.equ fjx, 512
+.equ fjy, 528
+.equ fjz, 544
+.equ half, 560
+.equ three, 576
+.equ is3, 592
+.equ ii3, 596
+.equ innerjjnr, 600
+.equ innerk, 604
+.equ salign, 608
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 612 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3
+ movaps [esp + iqH], xmm4
+
+i1020_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge i1020_unroll_loop
+ jmp i1020_odd_inner
+i1020_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ /* start with rsqO - seed in xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm7, xmm4 /* rinvO in xmm7 */
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm6, xmm4 /* rinvH1 in xmm6 */
+ /* rsqH2 - seed in xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm5, xmm4 /* rinvH2 in xmm5 */
+
+ /* do O interactions */
+ movaps xmm4, xmm7
+ mulps xmm4, xmm4 /* xmm7=rinv, xmm4=rinvsq */
+ mulps xmm7, [esp + qqO] /* xmm7=vcoul */
+
+ mulps xmm4, xmm7 /* total fsO in xmm4 */
+
+ addps xmm7, [esp + vctot]
+
+ movaps [esp + vctot], xmm7
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H1 interactions */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm6=rinv, xmm4=rinvsq */
+ mulps xmm6, [esp + qqH] /* xmm6=vcoul */
+ mulps xmm4, xmm6 /* total fsH1 in xmm4 */
+
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ movaps [esp + vctot], xmm6
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H2 interactions */
+ movaps xmm4, xmm5
+ mulps xmm4, xmm4 /* xmm5=rinv, xmm4=rinvsq */
+ mulps xmm5, [esp + qqH] /* xmm5=vcoul */
+ mulps xmm4, xmm5 /* total fsH2 in xmm4 */
+
+ addps xmm5, [esp + vctot] /* add vcoul to the running potential */
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm0, xmm4 /* xmm0-xmm2 = fs * dr (force components) */
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1020_odd_inner
+ jmp i1020_unroll_loop
+i1020_odd_inner: /* reached when innerk went negative: undo the -4 bias to get the leftover count */
+ add [esp + innerk], 4
+ jnz i1020_odd_loop /* nonzero remainder: handle the leftovers one j atom at a time */
+ jmp i1020_updateouterdata
+i1020_odd_loop: /* one j atom per pass; the 4 lanes hold O, a padding lane, H1 and H2 of the i water */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx] /* eax = jnr */
+ add [esp + innerjjnr], 4 /* advance to the next j index */
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH] /* xmm4 = iqO, 0, and two floats from the iqH slot */
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0 /* broadcast the j charge to all lanes */
+ mulps xmm3, xmm4 /* per-lane charge products; lane 1 is zero */
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* replace jnr with j3 = 3*jnr */
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0 /* broadcast jx, jy, jz to all lanes */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movss xmm3, [esp + ixO] /* lane 0 = O coordinate, upper lanes zeroed by the load */
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7 /* low half = first float of ixH1, first float of ixH2 */
+ movlhps xmm3, xmm6 /* xmm3 = ixO 0 ixH1 ixH2 */
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6 /* xmm4 = iyO 0 iyH1 iyH2 */
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6 /* xmm5 = izO 0 izH1 izH2 */
+
+ subps xmm3, xmm0 /* dr = i coordinate - j coordinate */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3 /* the O slots hold dr for all lanes in this loop */
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5 /* keep a copy of the seed lu */
+ mulps xmm5, xmm5 /* lu*lu */
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2 /* lu*(3-rsq*lu*lu) */
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm3, [esp + qqO]
+
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ movaps [esp + vctot], xmm3
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0 /* only lane 0 (the O lane) is accumulated here */
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* move lane 2 (H1) into lane 0 */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* move lane 3 (H2) into lane 0 */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4] /* current j force x,y */
+ movss xmm7, [edi + eax*4 + 8] /* current j force z */
+ unpcklps xmm3, xmm1 /* tx0 ty0 tx1 ty1 */
+ movlhps xmm3, xmm5 /* zero the upper half: tx0 ty0 0 0 (padding lane dropped) */
+ unpckhps xmm0, xmm1 /* tx2 ty2 tx3 ty3 */
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0 /* bring tx3 ty3 down to the low half */
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2 /* xmm1 low half = tz2 tz3 */
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1 /* tz3 into lane 0 */
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0 /* j force -= summed i-water force (only the low half is stored) */
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk]
+ jz i1020_updateouterdata /* no leftovers remain */
+ jmp i1020_odd_loop
+i1020_updateouterdata: /* reduce the per-lane i forces and potential, write them out, then loop or exit */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0 /* upper two lanes down to the low half */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* lane 1 into lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2 /* xmm7 lane 0 = z sum */
+ movlhps xmm6, xmm1 /* lane 0 = x sum, lane 2 = y sum */
+ shufps xmm6, xmm6, 0b1000 /* pack to lane 0 = x, lane 1 = y */
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12] /* +12..+20: the atom following the first one */
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* pack to lane 0 = x, lane 1 = y */
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24] /* +24..+32: the third atom */
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* pack to lane 0 = x, lane 1 = y */
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4] /* fshift x,y at index is3 */
+ movss xmm4, [esi + edx*4 + 8] /* fshift z */
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid]
+ mov edx, [edx] /* edx = value at the current gid pointer */
+ add [ebp + gid], 4 /* advance the gid pointer for the next outer iteration */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* lane 0 = sum of all four vctot lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1020_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* the nri argument slot doubles as the remaining-iterations counter */
+ jmp i1020_outer
+i1020_end: /* epilogue: release the local frame and restore the saved registers */
+ emms
+ mov eax, [esp + salign] /* number of bytes that was subtracted to align esp */
+ add esp, eax /* undo the alignment adjustment */
+ add esp, 612 /* release local stack space (presumably matches the prologue's sub - confirm) */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl1030_sse
+ .type inl1030_sse,@function
+inl1030_sse: /* entry: set up the frame, constants and the three charge products, then fall into the outer loop */
+.equ nri, 8 /* arguments, addressed as offsets from ebp */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0 /* 16-byte slots: one 4-float vector each */
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ vctot, 768
+.equ fixO, 784
+.equ fiyO, 800
+.equ fizO, 816
+.equ fixH1, 832
+.equ fiyH1, 848
+.equ fizH1, 864
+.equ fixH2, 880
+.equ fiyH2, 896
+.equ fizH2, 912
+.equ fjxO, 928
+.equ fjyO, 944
+.equ fjzO, 960
+.equ fjxH1, 976
+.equ fjyH1, 992
+.equ fjzH1, 1008
+.equ fjxH2, 1024
+.equ fjyH2, 1040
+.equ fjzH2, 1056
+.equ half, 1072
+.equ three, 1088
+.equ rsqOO, 1104
+.equ rsqOH1, 1120
+.equ rsqOH2, 1136
+.equ rsqH1O, 1152
+.equ rsqH1H1, 1168
+.equ rsqH1H2, 1184
+.equ rsqH2O, 1200
+.equ rsqH2H1, 1216
+.equ rsqH2H2, 1232
+.equ rinvOO, 1248
+.equ rinvOH1, 1264
+.equ rinvOH2, 1280
+.equ rinvH1O, 1296
+.equ rinvH1H1, 1312
+.equ rinvH1H2, 1328
+.equ rinvH2O, 1344
+.equ rinvH2H1, 1360
+.equ rinvH2H2, 1376
+.equ is3, 1392 /* 4-byte integer slots from here on */
+.equ ii3, 1396
+.equ innerjjnr, 1400
+.equ innerk, 1404
+.equ salign, 1408
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1412 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = misalignment of esp relative to 16 bytes */
+ sub esp, eax /* esp is now 16-byte aligned, as movaps requires */
+ mov [esp + salign], eax /* remember the adjustment so it can be undone on exit */
+
+ emms
+
+ movups xmm0, [sse_half] /* unaligned loads of constants defined outside this chunk */
+ movups xmm1, [sse_three]
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* charge[ii] */
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4] /* charge[ii+1] */
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3 /* charge[ii]^2 */
+ mulss xmm4, xmm5 /* charge[ii]*charge[ii+1] */
+ mulss xmm5, xmm5 /* charge[ii+1]^2 */
+ mulss xmm3, xmm6 /* scale all three products by facel */
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0 /* broadcast each product to all four lanes */
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+i1030_outer: /* per-i setup: shifted i coordinates, cleared accumulators, j-list bounds */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4] /* xmm0-xmm2 = shift vector x, y, z */
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4] /* shift + position of the first atom (O) */
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0 /* broadcast to all four lanes */
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0 /* fresh copy of the shift vector for the third atom */
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12] /* shift + position of the second atom (H1) */
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24] /* shift + position of the third atom (H2) */
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4 /* bias the count by -4; the flags from this sub feed the jge below */
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge i1030_unroll_loop /* at least 4 j entries: take the unrolled loop */
+ jmp i1030_single_check
+i1030_unroll_loop: /* 4 j waters per pass: all 9 atom-pair interactions with the i water, one j water per lane */
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3 /* xmm1= jxH1a jxH1b jxH1c jxH1d */
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7 /* xmm3= jyH1a jyH1b jyH1c jyH1d */
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5 /* xmm4= jxH2a jxH2b jxH2c jxH2d */
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6 /* xmm5= jyH2a jyH2b jyH2c jyH2d */
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8] /* z of waters a and c into lane 0 */
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4] /* y,z of waters b and d into the upper half; z lands in lane 3 */
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100 /* pick lanes 0 and 3 of each: jz a b c d */
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO] /* dr and rsq for the O-O (xmm0-2) and O-H1 (xmm3-5) pairs */
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO] /* dr and rsq for the O-H2 (xmm0-2) and H1-O (xmm3-5) pairs */
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1] /* dr and rsq for the H1-H1 (xmm0-2) and H1-H2 (xmm3-5) pairs */
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2] /* dr and rsq for the H2-O (xmm0-2) and H2-H1 (xmm3-5) pairs */
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3 /* rsqH2H1 is summed into xmm4 here and stays live for the rsqrtps below */
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2] /* dr and rsq for the H2-H2 pair; xmm4 must not be touched */
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt: rsqH2H2 is still in xmm0, rsqH2H1 in xmm4 */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1 /* keep copies of the two seeds */
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1 /* lu*lu */
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0 /* rsq*lu*lu */
+ mulps xmm5, xmm4
+ subps xmm3, xmm1 /* 3-rsq*lu*lu */
+ subps xmm7, xmm5
+ mulps xmm3, xmm2 /* lu*(3-rsq*lu*lu) */
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO] /* same refinement, two pairs at a time, reading rsq from the stack */
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvOO */
+ mulps xmm7, [esp + half] /* rinvOH1 */
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvOH2 */
+ mulps xmm7, [esp + half] /* rinvH1O */
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH1H1 */
+ mulps xmm7, [esp + half] /* rinvH1H2 */
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O] /* the ninth pair is refined on its own */
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinvH2O */
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction */
+ movaps xmm0, [esp + rinvOO]
+ movaps xmm7, xmm0
+ mulps xmm0, xmm0 /* rinvsq */
+ mulps xmm7, [esp + qqOO] /* vcoul */
+ mulps xmm0, xmm7 /* fsOO */
+ addps xmm7, [esp + vctot] /* xmm7 carries the local vctot until it is stored in the H2-H2 step */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO] /* xmm0-xmm2 = fs * dr */
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0 /* j force starts at 0 - f */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO] /* i force += f */
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsOH1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0 /* first contribution to the j H1 force: 0 - f */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsOH2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0 /* first contribution to the j H2 force: 0 - f */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsH1O */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxO] /* from here on the j forces are accumulated, not reinitialised */
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH1H1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH1H2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsH2O */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH2H1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH2H2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps [esp + vctot], xmm7 /* all nine pairs added: write the local vctot back */
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2]
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4] /* first 8 of the 9 force floats of waters a and b */
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0 /* the fj values already carry the minus sign, so they are added */
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32] /* ninth float: z force of H2 */
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2] /* lane 0 = fjzH2a */
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1 /* lane 0 = fjzH2b */
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10 /* lane 0 = fjzH2c */
+ shufps xmm7, xmm7, 0b11 /* lane 0 = fjzH2d */
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1030_single_check /* fewer than 4 j entries left */
+ jmp i1030_unroll_loop
+i1030_single_check:
+ add [esp + innerk], 4
+ jnz i1030_single_loop
+ jmp i1030_updateouterdata
+i1030_single_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ xorps xmm1, xmm1
+ movaps xmm0, xmm3
+ xorps xmm4, xmm4
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ /* fetch charges to xmm4 (temporary) */
+ movss xmm4, [esp + qqOO]
+
+ movhps xmm4, [esp + qqOH]
+
+ mulps xmm3, xmm4 /* xmm3=vcoul */
+ mulps xmm0, xmm3 /* total fscal */
+ addps xmm3, [esp + vctot]
+ movaps [esp + vctot], xmm3
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ /* assemble charges in xmm6 */
+ xorps xmm6, xmm6
+ /* do coulomb interaction */
+ movaps xmm0, xmm3
+ movss xmm6, [esp + qqOH]
+ movaps xmm4, xmm7
+ movhps xmm6, [esp + qqHH]
+ mulps xmm0, xmm0 /* rinvsq */
+ mulps xmm4, xmm4 /* rinvsq */
+ mulps xmm3, xmm6 /* vcoul */
+ mulps xmm7, xmm6 /* vcoul */
+ movaps xmm2, xmm3
+ addps xmm2, xmm7 /* total vcoul */
+ mulps xmm0, xmm3 /* fscal */
+
+ addps xmm2, [esp + vctot]
+ mulps xmm7, xmm4 /* fscal */
+ movaps [esp + vctot], xmm2
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do forces H2 - j water */
+ movaps xmm0, xmm7
+ movaps xmm1, xmm7
+ movaps xmm2, xmm7
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10
+ shufps xmm7, xmm7, 0b11
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4
+ unpckhps xmm5, xmm4
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz i1030_updateouterdata
+ jmp i1030_single_loop
+i1030_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1030_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1030_outer
+i1030_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 1412
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+
+
+.globl inl1100_sse /* SSE inner loop: Coulomb (qq*rinv) plus Lennard-Jones (c12*rinv^12 - c6*rinv^6) for each i atom against its j list */
+ .type inl1100_sse,@function
+inl1100_sse:
+.equ nri, 8 /* offsets of the arguments relative to ebp */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables (relative to the aligned esp) */
+ /* bottom of stack is 16-byte aligned for sse use (movaps) */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ c6, 112
+.equ c12, 128
+.equ six, 144
+.equ twelve, 160
+.equ vctot, 176
+.equ vnbtot, 192
+.equ fix, 208
+.equ fiy, 224
+.equ fiz, 240
+.equ half, 256
+.equ three, 272
+.equ is3, 288 /* 4-byte scalar slots from here on */
+.equ ii3, 292
+.equ ntia, 296
+.equ innerjjnr, 300
+.equ innerk, 304
+.equ salign, 308
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 312 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = misalignment of esp (0-15 bytes) */
+ sub esp, eax /* esp is now 16-byte aligned */
+ mov [esp + salign], eax /* saved so the epilogue can undo the adjustment */
+
+ emms
+
+ movups xmm0, [sse_half] /* constants defined outside this routine */
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movaps [esp + half], xmm0 /* keep copies in aligned stack slots */
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+
+ /* assume we have at least one i particle - start directly */
+i1100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4] /* xmm0-xmm2 = shift vector x, y, z */
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* xmm3 = charge[ii] */
+ mulss xmm3, [ebp + facel] /* xmm3 = facel*charge[ii] */
+ shufps xmm3, xmm3, 0 /* broadcast to all four slots */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4] /* edx = type[ii] */
+ imul edx, [ebp + ntype]
+ shl edx, 1 /* edx = 2*ntype*type[ii] */
+ mov [esp + ntia], edx /* base index into nbfp[] for this i atom */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4] /* shifted i coordinate = shift vector + pos[ii3] */
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0 /* broadcast i coordinates to all four slots */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] (4-byte entries) */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms - 4 */
+ jge i1100_unroll_loop /* flags are still those of the sub; mov does not alter them */
+ jmp i1100_finish_inner
+i1100_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0 /* xmm3 = q1 q1 q2 q2 */
+ shufps xmm4, xmm7, 0 /* xmm4 = q3 q3 q4 q4 */
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4] /* eax-edx = type[jnr1-4] */
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1 /* 2*type: nbfp[] is read as pairs of floats below */
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi /* eax-edx = ntia + 2*type[jnr] */
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4] /* xmm6 = pair for j1 (low) and j2 (high) */
+ movlps xmm7, [esi + ecx*4] /* xmm7 = pair for j3 (low) and j4 (high) */
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = first element of each pair (stored as c6) */
+ shufps xmm6, xmm7, 0b11011101 /* xmm6 = second element of each pair (stored as c12) */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm2 /* xmm3 = qq = iq*jq */
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4] /* xmm4 = x1 y1 (x2 y2 added below) */
+ movlps xmm5, [esi + ecx*4] /* xmm5 = x3 y3 (x4 y4 added below) */
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0 /* xmm2 = z1 z1 z2 z2 */
+ shufps xmm6, xmm1, 0 /* xmm6 = z3 z3 z4 z4 */
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z1 z2 z3 z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x1 x2 x3 x4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y1 y2 y3 y4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6] /* xmm1 = vnb6 */
+ mulps xmm2, [esp + c12] /* xmm2 = vnb12 */
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1 /* twelve*vnb12 - six*vnb6 */
+ addps xmm2, xmm3 /* ... + vcoul */
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx, fjy (the z forces are handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* bring tz of j2 (slot 1) to slot 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* bring tz of j3 (slot 2) to slot 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* bring tz of j4 (slot 3) to slot 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1100_finish_inner
+ jmp i1100_unroll_loop
+i1100_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4 /* undo the bias: innerk = atoms left over (0-3) */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set means two or three remain */
+ jnz i1100_dopair
+ jmp i1100_checksingle
+i1100_dopair:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ xorps xmm3, xmm3
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0b00001100
+ shufps xmm3, xmm3, 0b01011000 /* xmm3(0,1) has the charges */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* slots 0,1 = first element of each pair (c6) */
+ shufps xmm6, xmm6, 0b1101 /* slots 0,1 = second element of each pair (c12) */
+ movlhps xmm4, xmm7 /* zero the two unused upper slots */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ movlhps xmm3, xmm7 /* zero qq in the two unused upper slots */
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* xmm2 = z1 z2 z1 z2 */
+
+ shufps xmm0, xmm0, 0b10001000 /* xmm0 = x1 x2 x1 x2 */
+ shufps xmm1, xmm1, 0b11011101 /* xmm1 = y1 y2 y1 y2; upper slots carry zero qq, c6 and c12 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap slots 0 and 1: second atom's force to slot 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+i1100_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 set means one odd atom is left */
+ jnz i1100_dosingle
+ jmp i1100_updateouterdata
+i1100_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ xorps xmm3, xmm3
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* slot 0 = first element (c6), other slots zero */
+ shufps xmm6, xmm6, 0b11111101 /* slot 0 = second element (c12), other slots zero */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+i1100_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* low half of xmm3-xmm5 = high half of xmm0-xmm2 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring slot 1 down to slot 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1100_end /* taken when the decremented count is zero */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* the remaining count is kept in the argument slot */
+ jmp i1100_outer
+i1100_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax /* undo the alignment adjustment made in the prologue */
+ add esp, 312 /* release the local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl2100_sse
+ .type inl2100_sse,@function
+inl2100_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ krf, 60
+.equ crf, 64
+.equ type, 68
+.equ ntype, 72
+.equ nbfp, 76
+.equ Vnb, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ c6, 112
+.equ c12, 128
+.equ six, 144
+.equ twelve, 160
+.equ vctot, 176
+.equ vnbtot, 192
+.equ fix, 208
+.equ fiy, 224
+.equ fiz, 240
+.equ half, 256
+.equ three, 272
+.equ two, 288
+.equ krf, 304
+.equ crf, 320
+.equ is3, 336
+.equ ii3, 340
+.equ ntia, 344
+.equ innerjjnr, 348
+.equ innerk, 352
+.equ salign, 356
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 360 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + krf]
+ movss xmm6, [ebp + crf]
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+ movaps [esp + two], xmm4
+ shufps xmm5, xmm5, 0
+ shufps xmm6, xmm6, 0
+ movaps [esp + krf], xmm5
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+.i2100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i2100_unroll_loop
+ jmp .i2100_finish_inner
+.i2100_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm2
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+ movaps xmm1, xmm4
+ subps xmm6, [esp + crf]
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm6, xmm3 /* xmm6=vcoul=qq*(rinv+krsq) */
+ mulps xmm7, [esp + two]
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ subps xmm0, xmm7
+ mulps xmm3, xmm0
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm5 contains fjx, fjy, fjz */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i2100_finish_inner
+ jmp .i2100_unroll_loop
+.i2100_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i2100_dopair
+ jmp .i2100_checksingle
+.i2100_dopair:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ xorps xmm3, xmm3
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0b00001100
+ shufps xmm3, xmm3, 0b01011000 /* xmm3(0,1) has the charges */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+ movaps xmm1, xmm4
+ subps xmm6, [esp + crf]
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm6, xmm3 /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm7, [esp + two]
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ subps xmm0, xmm7
+ mulps xmm3, xmm0
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i2100_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i2100_dosingle
+ jmp .i2100_updateouterdata
+.i2100_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ xorps xmm3, xmm3
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+ movaps xmm1, xmm4
+ subps xmm6, [esp + crf]
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm6, xmm3 /* xmm6=vcoul */
+ mulps xmm7, [esp + two]
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ subps xmm0, xmm7
+ mulps xmm3, xmm0
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i2100_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2100_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2100_outer
+.i2100_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 360
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+ /* inl2000_sse: SSE inner loop for a Coulomb interaction with krf/crf terms. */
+ /* For each i particle and each j particle in its neighbour range it forms  */
+ /*   vcoul = qq*(rinv + krf*rsq - crf)                                      */
+ /*   fscal = qq*(rinv - two*krf*rsq)*rinvsq                                 */
+ /* with qq = facel*charge[i]*charge[j], adds fscal*dr to the i force,       */
+ /* subtracts it from the j force, and accumulates vcoul into Vc[gid].       */
+ /* The j loop is unrolled by four, with pair and single remainder paths.    */
+ /* Arguments are read from the stack relative to ebp; the routine assumes   */
+ /* nri is at least one.                                                     */
+.globl inl2000_sse
+ .type inl2000_sse,@function
+inl2000_sse:
+ /* ebp-relative offsets of the stack arguments */
+.equ nri, 8 /* number of outer (i particle) iterations */
+.equ iinr, 12 /* pointer into i particle index list, advanced per i */
+.equ jindex, 16 /* pointer into jindex[]; [n],[n+1] bound the j range */
+.equ jjnr, 20 /* base of j particle index list */
+.equ shift, 24 /* pointer into shift index list, advanced per i */
+.equ shiftvec, 28 /* base of shift vectors (3 floats each) */
+.equ fshift, 32 /* base of shift force accumulators (3 floats each) */
+.equ gid, 36 /* pointer into group index list, advanced per i */
+.equ pos, 40 /* base of coordinates (3 floats per particle) */
+.equ faction, 44 /* base of forces (3 floats per particle) */
+.equ charge, 48 /* base of charges (1 float per particle) */
+.equ facel, 52 /* float scale factor applied to the i charge */
+.equ Vc, 56 /* base of coulomb energy array, indexed by group */
+ /* The two float arguments below must NOT be called krf/crf: those names */
+ /* are reassigned as stack offsets further down, and since .equ symbols */
+ /* take the most recent value, [ebp + krf] would address ebp+224. */
+.equ argkrf, 60 /* float krf argument */
+.equ argcrf, 64 /* float crf argument */
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+ /* 16-byte slots hold four packed floats, 4-byte slots hold integers */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ vctot, 112
+.equ fix, 128
+.equ fiy, 144
+.equ fiz, 160
+.equ half, 176
+.equ three, 192
+.equ two, 208
+.equ krf, 224
+.equ crf, 240
+.equ is3, 256
+.equ ii3, 260
+.equ innerjjnr, 264
+.equ innerk, 268
+.equ salign, 272
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 276 /* local stack space */
+ /* round esp down to a 16-byte boundary and remember the adjustment */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the packed constants and broadcast krf/crf into aligned locals */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + argkrf] /* krf argument (not the local slot) */
+ movss xmm6, [ebp + argcrf] /* crf argument (not the local slot) */
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + two], xmm4
+ shufps xmm5, xmm5, 0
+ movaps [esp + krf], xmm5
+ shufps xmm6, xmm6, 0
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+.i2000_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ /* xmm0-xmm2 = shift vector x,y,z */
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* iq = facel*charge[ii], broadcast to all four lanes */
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ /* i coordinates = shift vector + pos[ii3] */
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4 /* sets the flags tested by jge below */
+ mov [esp + innerk], edx /* number of innerloop atoms, minus 4 */
+ jge .i2000_unroll_loop
+ jmp .i2000_finish_inner
+.i2000_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm2 /* xmm3=qq=iq*jq */
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2=jz for j1-j4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0=jx for j1-j4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1=jy for j1-j4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ /* refine the seed: rinv = half*lu*(three - rsq*lu*lu) */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+ mulps xmm6, xmm3 /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm7, [esp + two] /* xmm7=two*krsq */
+
+ subps xmm0, xmm7 /* xmm0=rinv-two*krsq */
+ mulps xmm3, xmm0
+ mulps xmm4, xmm3 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 contains fjx and xmm4 fjy for j1-j4 (z is handled below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ /* each shufps below moves the next lane of tz into position 0 */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i2000_finish_inner
+ jmp .i2000_unroll_loop
+.i2000_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4 /* undo the bias: innerk = atoms left (0-3) */
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i2000_dopair
+ jmp .i2000_checksingle
+.i2000_dopair:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ xorps xmm3, xmm3
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0b00001100
+ shufps xmm3, xmm3, 0b01011000 /* xmm3(0,1) has the charges */
+
+ mov edi, [ebp + pos]
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ /* zero qq in the two unused upper lanes so they contribute nothing */
+ xorps xmm7,xmm7
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+ mulps xmm6, xmm3 /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm7, [esp + two] /* xmm7=two*krsq */
+
+ subps xmm0, xmm7 /* xmm0=rinv-two*krsq */
+ mulps xmm3, xmm0
+
+ mulps xmm4, xmm3 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ /* first j particle: force components are in lane 0 */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 to bring the second j particle into lane 0 */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i2000_checksingle:
+ /* an odd remainder means one last j particle is left */
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i2000_dosingle
+ jmp .i2000_updateouterdata
+.i2000_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ xorps xmm3, xmm3
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq] /* qq in lane 0, upper lanes are zero */
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ movaps xmm7, [esp + krf]
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ mulps xmm7, xmm4 /* xmm7=krsq */
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm6, xmm0
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+ mulps xmm6, xmm3 /* xmm6=vcoul */
+ mulps xmm7, [esp + two] /* xmm7=two*krsq */
+
+ subps xmm0, xmm7 /* xmm0=rinv-two*krsq */
+ mulps xmm3, xmm0
+ mulps xmm4, xmm3 /* xmm4=total fscal */
+ addps xmm6, [esp + vctot]
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm6
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i2000_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* horizontal add: fold the upper half onto the lower half ... */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ /* ... then add lane 1 onto lane 0 */
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2000_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2000_outer
+.i2000_end:
+ emms
+ /* undo the alignment adjustment and the local frame, restore registers */
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 276
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+.globl inl1110_sse
+ .type inl1110_sse,@function
+inl1110_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ nsatoms, 76
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ c6, 112
+.equ c12, 128
+.equ two, 144
+.equ six, 160
+.equ twelve, 176
+.equ vctot, 192
+.equ vnbtot, 208
+.equ fix, 224
+.equ fiy, 240
+.equ fiz, 256
+.equ half, 272
+.equ three, 288
+.equ is3, 304
+.equ ii3, 308
+.equ shX, 312
+.equ shY, 316
+.equ shZ, 320
+.equ ntia, 324
+.equ innerjjnr0, 328
+.equ innerk0, 332
+.equ innerjjnr, 336
+.equ innerk, 340
+.equ salign, 344
+.equ nsvdwc, 348
+.equ nscoul, 352
+.equ nsvdw, 356
+.equ solnr, 360
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 364 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movups xmm3, [sse_six]
+ movups xmm4, [sse_twelve]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ movaps [esp + six], xmm3
+ movaps [esp + twelve], xmm4
+
+ /* assume we have at least one i particle - start directly */
+i1110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movlps xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 8]
+ movlps [esp + shX], xmm0
+ movss [esp + shZ], xmm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz i1110_mno_vdwc
+ jmp i1110_testcoul
+i1110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* innerloop atom count minus 4; negative if fewer than 4 remain */
+ jge i1110_unroll_vdwc_loop
+ jmp i1110_finish_vdwc_inner
+i1110_unroll_vdwc_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm2
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 and xmm4 contain fjx and fjy; fjz is updated separately below */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1110_finish_vdwc_inner
+ jmp i1110_unroll_vdwc_loop
+i1110_finish_vdwc_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz i1110_dopair_vdwc
+ jmp i1110_checksingle_vdwc
+i1110_dopair_vdwc:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+i1110_checksingle_vdwc:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz i1110_dosingle_vdwc
+ jmp i1110_updateouterdata_vdwc
+i1110_dosingle_vdwc:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+i1110_updateouterdata_vdwc:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in the two low elements of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc]
+ jz i1110_testcoul
+ jmp i1110_mno_vdwc
+i1110_testcoul:
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz i1110_mno_coul
+ jmp i1110_testvdw
+i1110_mno_coul:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* innerloop atom count minus 4; negative if fewer than 4 remain */
+ jge i1110_unroll_coul_loop
+ jmp i1110_finish_coul_inner
+
+i1110_unroll_coul_loop:
+ /* quad-unrolled innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm5, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ mulps xmm3, xmm5
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ mov edi, [ebp + faction]
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 and xmm4 contain fjx and fjy; fjz is updated separately below */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1110_finish_coul_inner
+ jmp i1110_unroll_coul_loop
+i1110_finish_coul_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz i1110_dopair_coul
+ jmp i1110_checksingle_coul
+i1110_dopair_coul:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ mulps xmm3, [esp + iq]
+ xorps xmm7,xmm7
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+i1110_checksingle_coul:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz i1110_dosingle_coul
+ jmp i1110_updateouterdata_coul
+i1110_dosingle_coul:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ movss xmm3, [esi + eax*4] /* xmm3(0) has the charge */
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ mulps xmm3, [esp + iq]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ mov edi, [ebp + faction]
+ movaps xmm5, [esp + vctot]
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm4, xmm3 /* xmm4=fscal */
+ addps xmm5, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+i1110_updateouterdata_coul:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in the two low elements of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul]
+ jz i1110_testvdw
+ jmp i1110_mno_coul
+i1110_testvdw:
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz i1110_mno_vdw
+ jmp i1110_last_mno
+i1110_mno_vdw:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* innerloop atom count minus 4; negative if fewer than 4 remain */
+ jge i1110_unroll_vdw_loop
+ jmp i1110_finish_vdw_inner
+i1110_unroll_vdw_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 and xmm4 contain fjx and fjy; fjz is updated separately below */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl i1110_finish_vdw_inner
+ jmp i1110_unroll_vdw_loop
+i1110_finish_vdw_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz i1110_dopair_vdw
+ jmp i1110_checksingle_vdw
+i1110_dopair_vdw:
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+ xorps xmm7,xmm7
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+i1110_checksingle_vdw:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz i1110_dosingle_vdw
+ jmp i1110_updateouterdata_vdw
+i1110_dosingle_vdw:
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ xorps xmm6, xmm6
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+i1110_updateouterdata_vdw:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw]
+ jz i1110_last_mno
+ jmp i1110_mno_vdw
+i1110_last_mno:
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1110_outer
+i1110_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 364
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+/*
+ * inl1120_sse - SSE inner loop: Coulomb + Lennard-Jones between a
+ * three-site i molecule (sites O, H1, H2 stored consecutively in pos[]
+ * and faction[]) and single j atoms.
+ *
+ * All arguments are read from the stack relative to ebp (offsets below).
+ * nri, facel and ntype are used by value; the others are dereferenced
+ * as pointers. The shift, iinr, jindex and gid pointer slots and the
+ * nri slot are modified in place as the outer loop advances.
+ *
+ * Per j atom: rinv = 1/sqrt(rsq) from rsqrtps plus one Newton-Raphson
+ * step; the O site gets vcoul = qqO*rinv and the LJ terms
+ * c12*rinv^12 - c6*rinv^6, the H sites get vcoul = qqH*rinv only.
+ * iqO/iqH and ntia are computed once, from the first entry of iinr[].
+ * Energies are added to Vc[gid] and Vnb[gid]; forces to faction[] and
+ * fshift[]. mm0-mm3 are used as scratch and released with emms.
+ * The outer loop body runs at least once: nri is assumed to be >= 1.
+ *
+ * Memory-immediate instructions carry an explicit "dword ptr": with no
+ * register operand the assembler cannot infer the operand size.
+ */
+.globl inl1120_sse
+ .type inl1120_sse,@function
+inl1120_sse:
+ /* stack offsets (from ebp) of the arguments */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ c6, 352
+.equ c12, 368
+.equ six, 384
+.equ twelve, 400
+.equ vctot, 416
+.equ vnbtot, 432
+.equ fixO, 448
+.equ fiyO, 464
+.equ fizO, 480
+.equ fixH1, 496
+.equ fiyH1, 512
+.equ fizH1, 528
+.equ fixH2, 544
+.equ fiyH2, 560
+.equ fizH2, 576
+.equ fjx, 592
+.equ fjy, 608
+.equ fjz, 624
+.equ half, 640
+.equ three, 656
+.equ is3, 672
+.equ ii3, 676
+.equ ntia, 680
+.equ innerjjnr, 684
+.equ innerk, 688
+.equ salign, 692
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 696 /* local stack space */
+ /* round esp down to a multiple of 16; the adjustment is kept in salign */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the constants to aligned stack slots */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* iqO = facel*charge[ii], iqH = facel*charge[ii+1], broadcast to all four slots */
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3
+ movaps [esp + iqH], xmm4
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ mov [esp + ntia], ecx
+i1120_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ /* O position = shift vector + pos[ii3], broadcast to all four slots */
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ /* same for H1 (pos[ii3+3..5]) in xmm0-xmm2 and H2 (pos[ii3+6..8]) in xmm3-xmm5 */
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms, minus 4 */
+ jge i1120_unroll_loop
+ jmp i1120_odd_inner
+i1120_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ /* index into nbfp[]: 2*type[jnr] + ntia for each of the four j atoms */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* load the (c6,c12) pairs and transpose them into one c6 and one c12 vector */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ /* restore jnr1-4 from the mmx registers */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ /* rinv = 0.5*lu*(3-rsq*lu*lu), where lu is the rsqrtps lookup value */
+ /* start with rsqO - seed in xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm7, xmm4 /* rinvO in xmm7 */
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm6, xmm4 /* rinvH1 in xmm6 */
+ /* rsqH2 - seed in xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm5, xmm4 /* rinvH2 in xmm5 */
+
+ /* do O interactions */
+ movaps xmm4, xmm7
+ mulps xmm4, xmm4 /* xmm7=rinv, xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm7, [esp + qqO] /* xmm7=vcoul */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm3, xmm2
+ subps xmm3, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm3, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ addps xmm2, xmm7 /* 12*vnb12-6*vnb6+vcoul */
+ mulps xmm4, xmm2 /* total fsO in xmm4 */
+
+ addps xmm7, [esp + vctot]
+
+ movaps [esp + vnbtot], xmm3
+ movaps [esp + vctot], xmm7
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H1 interactions */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm6=rinv, xmm4=rinvsq */
+ mulps xmm6, [esp + qqH] /* xmm6=vcoul */
+ mulps xmm4, xmm6 /* total fsH1 in xmm4 */
+
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ movaps [esp + vctot], xmm6
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H2 interactions */
+ movaps xmm4, xmm5
+ mulps xmm4, xmm4 /* xmm5=rinv, xmm4=rinvsq */
+ mulps xmm5, [esp + qqH] /* xmm5=vcoul */
+ mulps xmm4, xmm5 /* total fsH2 in xmm4 */
+
+ addps xmm5, [esp + vctot]
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ /* load the x,y force pairs of the four j atoms and transpose */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* element 1 to position 0 */
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* element 2 to position 0 */
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* element 3 to position 0 */
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl i1120_odd_inner
+ jmp i1120_unroll_loop
+i1120_odd_inner:
+ /* undo the bias of 4; what is left is the number of remaining j atoms */
+ add dword ptr [esp + innerk], 4
+ jnz i1120_odd_loop
+ jmp i1120_updateouterdata
+i1120_odd_loop:
+ /* one j atom per pass; vector slots are (O, unused, H1, H2) */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add dword ptr [esp + innerjjnr], 4
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH]
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ /* c6 and c12 go to position 0 only; the other positions are zero */
+ xorps xmm6, xmm6
+ mov esi, [ebp + type]
+ mov ebx, [esi + eax*4]
+ mov esi, [ebp + nbfp]
+ shl ebx, 1
+ add ebx, [esp + ntia]
+ movlps xmm6, [esi + ebx*4]
+ movaps xmm7, xmm6
+ shufps xmm6, xmm6, 0b11111100
+ shufps xmm7, xmm7, 0b11111101
+ movaps [esp + c6], xmm6
+ movaps [esp + c12], xmm7
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ /* pack the i coordinates as (O, 0, H1, H2) in xmm3-xmm5 */
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ /* the LJ powers are only formed in position 0 (the O site) */
+ movaps xmm1, xmm4
+ mulss xmm1, xmm4
+ movaps xmm3, [esp + qqO]
+ mulss xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulss xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm3, xmm0 /* xmm3=vcoul */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subss xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulss xmm1, [esp + six]
+ mulss xmm2, [esp + twelve]
+ subss xmm2, xmm1
+ addps xmm2, xmm3
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm3, [esp + vctot]
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ movaps [esp + vctot], xmm3
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* shift right: element 2 (H1) to position 0 */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* shift right: element 3 (H2) to position 0 */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4]
+ movss xmm7, [edi + eax*4 + 8]
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5
+ unpckhps xmm0, xmm1
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk]
+ jz i1120_updateouterdata
+ jmp i1120_odd_loop
+i1120_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ /* xmm6 positions 0,1 = x,y sums; xmm7 position 0 = z sum */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for this i particle and advance the gid pointer */
+ mov edx, [ebp + gid]
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1120_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1120_outer
+i1120_end:
+ emms
+ /* undo the alignment adjustment and release the local stack space */
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 696
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl1130_sse
+ .type inl1130_sse,@function
+inl1130_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ c6, 768
+.equ c12, 784
+.equ six, 800
+.equ twelve, 816
+.equ vctot, 832
+.equ vnbtot, 848
+.equ fixO, 864
+.equ fiyO, 880
+.equ fizO, 896
+.equ fixH1, 912
+.equ fiyH1, 928
+.equ fizH1, 944
+.equ fixH2, 960
+.equ fiyH2, 976
+.equ fizH2, 992
+.equ fjxO, 1008
+.equ fjyO, 1024
+.equ fjzO, 1040
+.equ fjxH1, 1056
+.equ fjyH1, 1072
+.equ fjzH1, 1088
+.equ fjxH2, 1104
+.equ fjyH2, 1120
+.equ fjzH2, 1136
+.equ half, 1152
+.equ three, 1168
+.equ rsqOO, 1184
+.equ rsqOH1, 1200
+.equ rsqOH2, 1216
+.equ rsqH1O, 1232
+.equ rsqH1H1, 1248
+.equ rsqH1H2, 1264
+.equ rsqH2O, 1280
+.equ rsqH2H1, 1296
+.equ rsqH2H2, 1312
+.equ rinvOO, 1328
+.equ rinvOH1, 1344
+.equ rinvOH2, 1360
+.equ rinvH1O, 1376
+.equ rinvH1H1, 1392
+.equ rinvH1H2, 1408
+.equ rinvH2O, 1424
+.equ rinvH2H1, 1440
+.equ rinvH2H2, 1456
+.equ is3, 1472
+.equ ii3, 1476
+.equ innerjjnr, 1480
+.equ innerk, 1484
+.equ salign, 1488
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1492 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4]
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3
+ mulss xmm4, xmm5
+ mulss xmm5, xmm5
+ mulss xmm3, xmm6
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+ xorps xmm0, xmm0
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ mov edx, ecx
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ add edx, ecx
+ mov eax, [ebp + nbfp]
+ movlps xmm0, [eax + edx*4]
+ movaps xmm1, xmm0
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0b01010101
+ movaps [esp + c6], xmm0
+ movaps [esp + c12], xmm1
+
+i1130_outer:
+ /* Outer-loop setup for one i water: fetch its shift vector, build the */
+ /* shifted O/H1/H2 coordinates broadcast to all four lanes, clear the */
+ /* energy and i-force accumulators, and dispatch to the j loops. */
+ /* Memory-immediate adds carry an explicit "dword ptr" so the operand */
+ /* size is not left for the assembler to guess. */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ /* xmm0-xmm2 = x/y/z of the shift vector (low element only) */
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ /* shifted O position: shift + pos[ii3 .. ii3+2], broadcast to 4 lanes */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ /* reload the shift into the low element of xmm3-xmm5, then add the */
+ /* H1 (offsets 12-20) and H2 (offsets 24-32) coordinates */
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ /* flags still come from "sub edx, 4" (mov does not change them): */
+ /* at least four j entries left -> unrolled loop, otherwise remainder */
+ jge i1130_unroll_loop
+ jmp i1130_single_check
+i1130_unroll_loop:
+ /* quad-unroll innerloop here */
+ /* Handles four j waters per pass, one per SSE lane (a, b, c, d): */
+ /* transposes their coordinates, forms all nine i-j atom pair distances, */
+ /* computes 1/r for each pair, accumulates energies and i forces on the */
+ /* stack, and adds the j forces back into faction[]. */
+ /* Memory-immediate add/sub use an explicit "dword ptr" operand size. */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ /* z coordinates: a and c go to element 0 via movss, b and d arrive in */
+ /* element 3 via movhps (loaded 4 bytes early), then shufps picks */
+ /* elements 0 and 3 of each source to give the order a b c d */
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ /* distance vectors d = i - j and squared distances for O-O and O-H1 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ /* same for O-H2 and H1-O */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ /* same for H1-H1 and H1-H2 */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ /* same for H2-O and H2-H1 (the H2-H1 sum is built in xmm4) */
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ /* and finally H2-H2; rsqH2H2 stays in xmm0, rsqH2H1 is still in xmm4 */
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt use rsq values in xmm0, xmm4 */
+ /* each group below refines the rsqrtps estimate lu with one step of */
+ /* rinv = half * lu * (three - rsq*lu*lu), two pair types at a time */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO]
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O]
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half]
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction */
+ /* the only pair with a c6/c12 term; xmm7 carries the running coulomb */
+ /* energy through all nine interactions until it is stored to vctot */
+ movaps xmm0, [esp + rinvOO]
+ movaps xmm7, xmm0
+ mulps xmm0, xmm0
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ mulps xmm7, [esp + qqOO]
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm3, xmm2
+ subps xmm3, xmm1 /* xmm3=vnb12-vnb6 */
+ addps xmm3, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm3
+ subps xmm2, xmm1
+ addps xmm2, xmm7
+ addps xmm7, [esp + vctot]
+ mulps xmm0, xmm2
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ /* i force += fscal*d, j force = -fscal*d (first write to fjO) */
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsOH1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsOH2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction */
+ /* from here on the j forces are read back and decremented in place */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsH1O */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH1H1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH1H2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqOH]
+ mulps xmm0, xmm1 /* fsH2O */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH2H1 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm1, xmm0
+ mulps xmm0, xmm0
+ mulps xmm1, [esp + qqHH]
+ mulps xmm0, xmm1 /* fsH2H2 */
+ addps xmm7, xmm1 /* add to local vctot */
+ movaps xmm1, xmm0
+ movaps [esp + vctot], xmm7
+ movaps xmm2, xmm0
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2]
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ /* eax..edx still hold j3 for waters a..d; each water's nine force */
+ /* components are updated as two 4-float vectors plus one scalar */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4]
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32]
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1 /* element 1 (water b) to position 0 */
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10 /* element 2 (water c) to position 0 */
+ shufps xmm7, xmm7, 0b11 /* element 3 (water d) to position 0 */
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl i1130_single_check
+ jmp i1130_unroll_loop
+i1130_single_check:
+ /* innerk was biased by -4 for the unrolled loop; add the 4 back to get */
+ /* the number of leftover j waters. Non-zero -> handle them one at a */
+ /* time, zero -> go straight to the outer-loop update. */
+ /* "dword ptr" makes the 32-bit operand size explicit. */
+ add dword ptr [esp + innerk], 4
+ jnz i1130_single_loop
+ jmp i1130_updateouterdata
+i1130_single_loop:
+ /* Remainder loop: one j water per pass. The three j atoms share one */
+ /* register, laid out as (jO, 0, jH1, jH2), and each i atom (O, then */
+ /* H1 and H2 together) is processed against that vector. */
+ /* The charge vectors built below have a zero in element 1, so that */
+ /* unused lane adds nothing to the coulomb energy or the forces. */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer (explicit 32-bit operand size) */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ /* do invsqrt */
+ /* rsqrtps estimate lu refined as half * lu * (three - rsq*lu*lu) */
+ rsqrtps xmm1, xmm0
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ xorps xmm1, xmm1
+ movaps xmm0, xmm3
+ xorps xmm4, xmm4
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ /* fetch charges to xmm4 (temporary) */
+ /* xmm4 = qqOO 0 qqOH qqOH */
+ movss xmm4, [esp + qqOO]
+ movss xmm1, xmm0
+ movhps xmm4, [esp + qqOH]
+ mulss xmm1, xmm0
+ mulps xmm3, xmm4 /* xmm3=vcoul */
+ mulss xmm1, xmm0 /* xmm1(0)=rinvsix */
+ movaps xmm2, xmm1 /* zero everything else in xmm2 */
+ mulss xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ /* c6/c12 terms use scalar ops only: they apply to element 0 (O-O) */
+ mulss xmm1, [esp + c6]
+ mulss xmm2, [esp + c12]
+ movaps xmm4, xmm2
+ subss xmm4, xmm1 /* vnbtot=vnb12-vnb6 */
+ addps xmm4, [esp + vnbtot]
+ mulss xmm1, [esp + six]
+ mulss xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm4
+ subss xmm2, xmm1 /* fsD+fsR */
+ addps xmm2, xmm3 /* fsC+fsD+fsR */
+
+ addps xmm3, [esp + vctot]
+ mulps xmm0, xmm2 /* total fscal */
+ movaps [esp + vctot], xmm3
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1 /* do coulomb interaction */
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ /* assemble charges in xmm6 */
+ /* xmm6 = qqOH 0 qqHH qqHH */
+ xorps xmm6, xmm6
+ /* do coulomb interaction */
+ movaps xmm0, xmm3
+ movss xmm6, [esp + qqOH]
+ movaps xmm4, xmm7
+ movhps xmm6, [esp + qqHH]
+ mulps xmm0, xmm0 /* rinvsq */
+ mulps xmm4, xmm4 /* rinvsq */
+ mulps xmm3, xmm6 /* vcoul */
+ mulps xmm7, xmm6 /* vcoul */
+ movaps xmm2, xmm3
+ addps xmm2, xmm7 /* total vcoul */
+ mulps xmm0, xmm3 /* fscal */
+
+ addps xmm2, [esp + vctot]
+ mulps xmm7, xmm4 /* fscal */
+ movaps [esp + vctot], xmm2
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do forces H2 - j water */
+ movaps xmm0, xmm7
+ movaps xmm1, xmm7
+ movaps xmm2, xmm7
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ /* esi = faction; x/y pairs go through xmm0/xmm1, the three z */
+ /* components are added individually with scalar ops */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10 /* element 2 (H1) to position 0 */
+ shufps xmm7, xmm7, 0b11 /* element 3 (H2) to position 0 */
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4 /* low half: fjxO fjyO */
+ unpckhps xmm5, xmm4 /* fjxH1 fjyH1 fjxH2 fjyH2 */
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz i1130_updateouterdata
+ jmp i1130_single_loop
+i1130_updateouterdata:
+ /* End of one outer iteration: reduce the four-lane i-force accumulators */
+ /* to scalars and add them to faction[] and fshift[], reduce vctot and */
+ /* vnbtot into Vc[] and Vnb[] at this i particle's group index, then */
+ /* either loop to the next i water or leave. */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ /* xmm6 elements 0/1 = x/y sums, xmm7 element 0 = z sum */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer (explicit 32-bit operand size) */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ /* the nri argument slot doubles as the remaining-iterations counter */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz i1130_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp i1130_outer
+i1130_end: /* epilogue: undo the stack setup done at entry and return */
+ emms
+ mov eax, [esp + salign] /* alignment padding that was subtracted from esp at entry */
+ add esp, eax /* remove the alignment padding */
+ add esp, 1492 /* release the local stack space */
+ pop edi /* restore registers in reverse order of the entry pushes */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave /* restore esp/ebp from the frame set up by push ebp / mov ebp,esp */
+ ret
+
+
+
+.globl inl2120_sse
+ .type inl2120_sse,@function
+inl2120_sse:
+ /* Entry section: defines the argument and local-variable offsets, */
+ /* builds a 16-byte aligned stack frame, copies the constants and the */
+ /* krf/crf arguments into it, and precomputes the first i particle's */
+ /* charge products and type offset before the outer loop starts. */
+ /* argument offsets relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+ /* The krf/crf arguments have their own names because krf and crf are */
+ /* redefined below as local stack offsets (688/704). With a shared name */
+ /* the [ebp + krf] and [ebp + crf] loads further down would be */
+ /* assembled with the stack offsets and read the wrong memory. */
+.equ argkrf, 60
+.equ argcrf, 64
+.equ type, 68
+.equ ntype, 72
+.equ nbfp, 76
+.equ Vnb, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ c6, 352
+.equ c12, 368
+.equ six, 384
+.equ twelve, 400
+.equ vctot, 416
+.equ vnbtot, 432
+.equ fixO, 448
+.equ fiyO, 464
+.equ fizO, 480
+.equ fixH1, 496
+.equ fiyH1, 512
+.equ fizH1, 528
+.equ fixH2, 544
+.equ fiyH2, 560
+.equ fizH2, 576
+.equ fjx, 592
+.equ fjy, 608
+.equ fjz, 624
+.equ half, 640
+.equ three, 656
+.equ two, 672
+.equ krf, 688
+.equ crf, 704
+.equ krsqO, 720
+.equ krsqH1, 736
+.equ krsqH2, 752
+.equ is3, 768
+.equ ii3, 772
+.equ ntia, 776
+.equ innerjjnr, 780
+.equ innerk, 784
+.equ salign, 788
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 792 /* local stack space */
+ /* round esp down to a 16-byte boundary and remember the padding */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + argkrf] /* krf argument (ebp + 60) */
+ movss xmm6, [ebp + argcrf] /* crf argument (ebp + 64) */
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+ movaps [esp + two], xmm4
+ /* broadcast krf/crf to all four lanes and keep them on the stack */
+ shufps xmm5, xmm5, 0
+ shufps xmm6, xmm6, 0
+ movaps [esp + krf], xmm5
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* iqO = facel*charge[ii], iqH = facel*charge[ii+1], broadcast */
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3
+ movaps [esp + iqH], xmm4
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ mov [esp + ntia], ecx
+.i2120_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i2120_unroll_loop
+ jmp .i2120_odd_inner
+.i2120_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ movaps xmm0, xmm5
+ movaps xmm1, xmm6
+ movaps xmm2, xmm7
+
+ mulps xmm0, [esp + krf]
+ mulps xmm1, [esp + krf]
+ mulps xmm2, [esp + krf]
+
+ movaps [esp + krsqH2], xmm0
+ movaps [esp + krsqH1], xmm1
+ movaps [esp + krsqO], xmm2
+
+ /* start with rsqO - seed in xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm7, xmm4 /* rinvO in xmm7 */
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm6, xmm4 /* rinvH1 in xmm6 */
+ /* rsqH2 - seed in xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm5, xmm4 /* rinvH2 in xmm5 */
+
+ /* do O interactions */
+ movaps xmm4, xmm7
+ mulps xmm4, xmm4 /* xmm7=rinv, xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulps xmm1, xmm4
+ mulps xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm3, xmm2
+ subps xmm3, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm3, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1 /* nb part of fs */
+
+ movaps xmm0, xmm7
+ movaps xmm1, [esp + krsqO]
+ addps xmm0, xmm1
+ mulps xmm1, [esp + two]
+ subps xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+ subps xmm7, xmm1
+ mulps xmm0, [esp + qqO]
+ mulps xmm7, [esp + qqO]
+ addps xmm2, xmm7
+
+ mulps xmm4, xmm2 /* total fsO in xmm4 */
+
+ addps xmm0, [esp + vctot]
+ movaps [esp + vnbtot], xmm3
+ movaps [esp + vctot], xmm0
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H1 interactions */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm6=rinv, xmm4=rinvsq */
+ movaps xmm7, xmm6
+ movaps xmm0, [esp + krsqH1]
+ addps xmm6, xmm0 /* xmm6=rinv+krsq */
+ mulps xmm0, [esp + two]
+ subps xmm6, [esp + crf]
+ subps xmm7, xmm0 /* xmm7=rinv-2*krsq */
+ mulps xmm6, [esp + qqH] /* vcoul */
+ mulps xmm7, [esp + qqH]
+ mulps xmm4, xmm7 /* total fsH1 in xmm4 */
+
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ movaps [esp + vctot], xmm6
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H2 interactions */
+ movaps xmm4, xmm5
+ mulps xmm4, xmm4 /* xmm5=rinv, xmm4=rinvsq */
+ movaps xmm7, xmm5
+ movaps xmm0, [esp + krsqH2]
+ addps xmm5, xmm0 /* xmm5=rinv+krsq */
+ mulps xmm0, [esp + two]
+ subps xmm5, [esp + crf]
+ subps xmm7, xmm0 /* xmm7=rinv-2*krsq */
+ mulps xmm5, [esp + qqH] /* vcoul */
+ mulps xmm7, [esp + qqH]
+ mulps xmm4, xmm7 /* total fsH2 in xmm4 */
+
+ addps xmm5, [esp + vctot]
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i2120_odd_inner
+ jmp .i2120_unroll_loop
+.i2120_odd_inner:
+ add [esp + innerk], 4
+ jnz .i2120_odd_loop
+ jmp .i2120_updateouterdata
+.i2120_odd_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH]
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ xorps xmm6, xmm6
+ mov esi, [ebp + type]
+ mov ebx, [esi + eax*4]
+ mov esi, [ebp + nbfp]
+ shl ebx, 1
+ add ebx, [esp + ntia]
+ movlps xmm6, [esi + ebx*4]
+ movaps xmm7, xmm6
+ shufps xmm6, xmm6, 0b11111100
+ shufps xmm7, xmm7, 0b11111101
+ movaps [esp + c6], xmm6
+ movaps [esp + c12], xmm7
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ movaps xmm0, xmm4
+ mulps xmm0, [esp + krf]
+ movaps [esp + krsqO], xmm0
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+ movaps xmm1, xmm4
+ mulss xmm1, xmm4
+ mulss xmm1, xmm4 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulss xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subss xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulss xmm1, [esp + six]
+ mulss xmm2, [esp + twelve]
+ subss xmm2, xmm1
+
+ movaps xmm1, xmm0 /* xmm1=r */inv
+ movaps xmm3, [esp + krsqO]
+ addps xmm0, xmm3 /* xmm0=rinv+krsq */
+ mulps xmm3, [esp + two]
+ subps xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+ subps xmm1, xmm3 /* xmm1=rinv-2*krsq */
+ mulps xmm0, [esp + qqO] /* xmm0=vcoul */
+ mulps xmm1, [esp + qqO] /* xmm1=coul part of fs */
+
+ addps xmm2, xmm1 /* total fs */
+
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+ addps xmm0, [esp + vctot]
+ movaps [esp + vctot], xmm0
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* shift right */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* shift right */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4]
+ movss xmm7, [edi + eax*4 + 8]
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5
+ unpckhps xmm0, xmm1
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk]
+ jz .i2120_updateouterdata
+ jmp .i2120_odd_loop
+.i2120_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid]
+ mov edx, [edx]
+ add [ebp + gid], 4
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2120_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2120_outer
+.i2120_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 792
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl2130_sse
+ .type inl2130_sse,@function
+inl2130_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ krf, 60
+.equ crf, 64
+.equ type, 68
+.equ ntype, 72
+.equ nbfp, 76
+.equ Vnb, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ c6, 768
+.equ c12, 784
+.equ six, 800
+.equ twelve, 816
+.equ vctot, 832
+.equ vnbtot, 848
+.equ fixO, 864
+.equ fiyO, 880
+.equ fizO, 896
+.equ fixH1, 912
+.equ fiyH1, 928
+.equ fizH1, 944
+.equ fixH2, 960
+.equ fiyH2, 976
+.equ fizH2, 992
+.equ fjxO, 1008
+.equ fjyO, 1024
+.equ fjzO, 1040
+.equ fjxH1, 1056
+.equ fjyH1, 1072
+.equ fjzH1, 1088
+.equ fjxH2, 1104
+.equ fjyH2, 1120
+.equ fjzH2, 1136
+.equ half, 1152
+.equ three, 1168
+.equ rsqOO, 1184
+.equ rsqOH1, 1200
+.equ rsqOH2, 1216
+.equ rsqH1O, 1232
+.equ rsqH1H1, 1248
+.equ rsqH1H2, 1264
+.equ rsqH2O, 1280
+.equ rsqH2H1, 1296
+.equ rsqH2H2, 1312
+.equ rinvOO, 1328
+.equ rinvOH1, 1344
+.equ rinvOH2, 1360
+.equ rinvH1O, 1376
+.equ rinvH1H1, 1392
+.equ rinvH1H2, 1408
+.equ rinvH2O, 1424
+.equ rinvH2H1, 1440
+.equ rinvH2H2, 1456
+.equ two, 1472
+.equ krf, 1488
+.equ crf, 1504
+.equ is3, 1520
+.equ ii3, 1524
+.equ innerjjnr, 1528
+.equ innerk, 1532
+.equ salign, 1536
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1540 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm2, [sse_six]
+ movups xmm3, [sse_twelve]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + krf]
+ movss xmm6, [ebp + crf]
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + six], xmm2
+ movaps [esp + twelve], xmm3
+ movaps [esp + two], xmm4
+ shufps xmm5, xmm5, 0
+ shufps xmm6, xmm6, 0
+ movaps [esp + krf], xmm5
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4]
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3
+ mulss xmm4, xmm5
+ mulss xmm5, xmm5
+ mulss xmm3, xmm6
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+ xorps xmm0, xmm0
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ mov edx, ecx
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ add edx, ecx
+ mov eax, [ebp + nbfp]
+ movlps xmm0, [eax + edx*4]
+ movaps xmm1, xmm0
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0b01010101
+ movaps [esp + c6], xmm0
+ movaps [esp + c12], xmm1
+
+.i2130_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i2130_unroll_loop
+ jmp .i2130_single_check
+.i2130_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt use rsq values in xmm0, xmm4 */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO]
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O]
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half]
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction */
+ movaps xmm0, [esp + rinvOO]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ mulps xmm0, xmm0
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ mulps xmm5, [esp + rsqOO] /* xmm5=krsq */
+ movaps xmm6, xmm5
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+ subps xmm6, [esp + crf]
+
+ mulps xmm6, [esp + qqOO] /* xmm6=voul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOO] /* xmm7 = coul part of fscal */
+
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm3, xmm2
+ subps xmm3, xmm1 /* xmm3=vnb12-vnb6 */
+ addps xmm3, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm3
+ subps xmm2, xmm1
+ addps xmm2, xmm7
+ addps xmm6, [esp + vctot] /* local vctot summation variable */
+ mulps xmm0, xmm2
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqOH1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ mulps xmm0, xmm0
+ subps xmm4, [esp + crf]
+ mulps xmm4, [esp + qqOH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH1 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqOH2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ mulps xmm0, xmm0
+ subps xmm4, [esp + crf]
+ mulps xmm4, [esp + qqOH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction: Coulomb term with krf/crf for i H1 against the j O atoms. */
+ /* No LJ part here; xmm6 carries the running vctot sum between interaction blocks. */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* xmm1 is overwritten below before it is read */
+ mulps xmm5, [esp + rsqH1O] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ subps xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+ mulps xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H1-O = rinvsq*qq*(rinv-2*krsq) */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ /* j O already has force contributions, so start from the stored fjO values */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O] /* xmm0-xmm2 = force vector components */
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0 /* subtract from j O force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1] /* add to i H1 force */
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqH1H1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0
+ mulps xmm4, [esp + qqHH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqH1H2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ mulps xmm0, xmm0
+ subps xmm4, [esp + crf]
+ mulps xmm4, [esp + qqHH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqH2O] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0
+ mulps xmm4, [esp + qqOH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqH2H1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0
+ mulps xmm4, [esp + qqHH] /* xmm4=voul=qq*(rinv+krsq) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction: Coulomb term with krf/crf for i H2 against the j H2 atoms. */
+ /* This is the last pair of the unrolled iteration, so the running vctot */
+ /* held in xmm6 is written back to the stack here. */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ mulps xmm5, [esp + rsqH2H2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H2-H2 = rinvsq*qq*(rinv-2*krsq) */
+
+ /* copy fscal once for the y and z components (the original copied it twice) */
+ movaps xmm1, xmm0
+ movaps [esp + vctot], xmm6
+ movaps xmm2, xmm0
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2] /* xmm0-xmm2 = force vector components */
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0 /* subtract from j H2 force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2] /* add to i H2 force */
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4]
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32]
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10
+ shufps xmm7, xmm7, 0b11
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? */
+ /* innerk is a 32-bit counter; the operand size is spelled out because a */
+ /* memory destination with an immediate source does not imply one. */
+ sub dword ptr [esp + innerk], 4
+ jl .i2130_single_check
+ jmp .i2130_unroll_loop
+.i2130_single_check:
+ /* undo the last subtraction; a non-zero result is the number of j waters left */
+ add dword ptr [esp + innerk], 4
+ jnz .i2130_single_loop
+ jmp .i2130_updateouterdata
+.i2130_single_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx] /* eax = j index */
+ add dword ptr [esp + innerjjnr], 4 /* advance pointer by one 32-bit index */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr, used below as a float index (scaled by 4) */
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ movaps xmm6, xmm0
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ mulps xmm6, [esp + krf] /* xmm6=krsq */
+ movaps xmm2, xmm1
+ movaps xmm7, xmm6
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ addps xmm6, xmm3 /* xmm6=rinv+krsq */
+ mulps xmm7, [esp + two]
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+ xorps xmm1, xmm1
+ movaps xmm0, xmm3
+ subps xmm3, xmm7 /* xmm3=rinv-2*krsq */
+ xorps xmm4, xmm4
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ /* fetch charges to xmm4 (temporary) */
+ movss xmm4, [esp + qqOO]
+ movss xmm1, xmm0
+ movhps xmm4, [esp + qqOH]
+ mulss xmm1, xmm0
+
+ mulps xmm6, xmm4 /* vcoul */
+ mulps xmm3, xmm4 /* coul part of fs */
+
+ mulss xmm1, xmm0 /* xmm1(0)=rinvsix */
+ movaps xmm2, xmm1 /* zero everything else in xmm2 */
+ mulss xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulss xmm1, [esp + c6]
+ mulss xmm2, [esp + c12]
+ movaps xmm4, xmm2
+ subss xmm4, xmm1 /* vnbtot=vnb12-vnb6 */
+ addps xmm4, [esp + vnbtot]
+ mulss xmm1, [esp + six]
+ mulss xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm4
+ subss xmm2, xmm1 /* fsD+fsR */
+ addps xmm2, xmm3 /* fsC+fsD+fsR */
+
+ addps xmm6, [esp + vctot]
+ mulps xmm0, xmm2 /* total fscal */
+ movaps [esp + vctot], xmm6
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ mulps xmm0, [esp + krf] /* krsq */
+ mulps xmm4, [esp + krf] /* krsq */
+
+
+ /* assemble charges in xmm6 */
+ xorps xmm6, xmm6
+ movss xmm6, [esp + qqOH]
+ movhps xmm6, [esp + qqHH]
+ movaps xmm1, xmm0
+ movaps xmm5, xmm4
+ addps xmm0, xmm3 /* krsq+rinv */
+ addps xmm4, xmm7 /* krsq+rinv */
+ subps xmm0, [esp + crf]
+ subps xmm4, [esp + crf]
+ mulps xmm1, [esp + two]
+ mulps xmm5, [esp + two]
+ mulps xmm0, xmm6 /* vcoul */
+ mulps xmm4, xmm6 /* vcoul */
+ addps xmm4, xmm0
+ addps xmm4, [esp + vctot]
+ movaps [esp + vctot], xmm4
+ movaps xmm0, xmm3
+ movaps xmm4, xmm7
+ mulps xmm3, xmm3
+ mulps xmm7, xmm7
+ subps xmm0, xmm1
+ subps xmm4, xmm5
+ mulps xmm0, xmm6
+ mulps xmm4, xmm6
+ mulps xmm0, xmm3 /* fscal */
+ mulps xmm7, xmm4 /* fscal */
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do forces H2 - j water */
+ movaps xmm0, xmm7
+ movaps xmm1, xmm7
+ movaps xmm2, xmm7
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10
+ shufps xmm7, xmm7, 0b11
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4
+ unpckhps xmm5, xmm4
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz .i2130_updateouterdata
+ jmp .i2130_single_loop
+.i2130_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* edx = pointer to the current gid entry */
+ mov edx, [edx] /* edx = group index for this i particle */
+ add dword ptr [ebp + gid], 4 /* advance pointer by one 32-bit entry; size made explicit */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ /* nri is used as a countdown of remaining i particles: decrement it and */
+ /* leave when it reaches zero, otherwise store it back and loop. */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2130_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2130_outer
+.i2130_end:
+ /* Epilogue. The matching prologue is outside this view; presumably it */
+ /* reserved 1540 bytes of locals plus an alignment pad saved in salign, */
+ /* as inl2020_sse does with its own frame size - TODO confirm. */
+ emms
+ mov eax, [esp + salign] /* eax = saved alignment adjustment */
+ add esp, eax /* undo the alignment adjustment */
+ add esp, 1540 /* release the local variable area */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+/*
+ * inl2020_sse - entry point, frame layout and one-time constant setup.
+ *
+ * Arguments are read through ebp (offsets 8..64 below); locals live on a
+ * 16-byte aligned stack area addressed through esp.
+ *
+ * The krf and crf argument offsets are named argkrf/argcrf. The names
+ * krf/crf are also used for the stack slots further down, and a later
+ * .equ replaces the earlier value, so reusing the name made the argument
+ * loads read [ebp + 624] and [ebp + 640] instead of [ebp + 60] and
+ * [ebp + 64].
+ */
+.globl inl2020_sse
+ .type inl2020_sse,@function
+inl2020_sse:
+ /* argument offsets relative to ebp */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ argkrf, 60
+.equ argcrf, 64
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ vctot, 352
+.equ fixO, 384
+.equ fiyO, 400
+.equ fizO, 416
+.equ fixH1, 432
+.equ fiyH1, 448
+.equ fizH1, 464
+.equ fixH2, 480
+.equ fiyH2, 496
+.equ fizH2, 512
+.equ fjx, 528
+.equ fjy, 544
+.equ fjz, 560
+.equ half, 576
+.equ three, 592
+.equ two, 608
+.equ krf, 624
+.equ crf, 640
+.equ krsqO, 656
+.equ krsqH1, 672
+.equ krsqH2, 688
+.equ is3, 704
+.equ ii3, 708
+.equ innerjjnr, 712
+.equ innerk, 716
+.equ salign, 720
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 724 /* local stack space */
+ /* round esp down to a multiple of 16 and remember the adjustment */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the packed constants and broadcast krf/crf into aligned stack slots */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + argkrf] /* scalar krf argument */
+ movss xmm6, [ebp + argcrf] /* scalar crf argument */
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + two], xmm4
+ shufps xmm5, xmm5, 0 /* krf in all four lanes */
+ shufps xmm6, xmm6, 0 /* crf in all four lanes */
+ movaps [esp + krf], xmm5
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3
+ movaps [esp + iqH], xmm4
+
+.i2020_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4 (loop is unrolled by 4) */
+ jge .i2020_unroll_loop
+ jmp .i2020_odd_inner
+.i2020_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ movaps xmm0, xmm5
+ movaps xmm1, xmm6
+ movaps xmm2, xmm7
+
+ mulps xmm0, [esp + krf]
+ mulps xmm1, [esp + krf]
+ mulps xmm2, [esp + krf]
+
+ movaps [esp + krsqH2], xmm0
+ movaps [esp + krsqH1], xmm1
+ movaps [esp + krsqO], xmm2
+
+ /* start with rsqO - seed in xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm7, xmm4 /* rinvO in xmm7 */
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm6, xmm4 /* rinvH1 in xmm6 */
+ /* rsqH2 - seed in xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps xmm5, xmm4 /* rinvH2 in xmm5 */
+
+ /* do O interactions */
+ movaps xmm4, xmm7
+ mulps xmm4, xmm4 /* xmm7=rinv, xmm4=rinvsq */
+
+ movaps xmm0, xmm7
+ movaps xmm1, [esp + krsqO]
+ addps xmm0, xmm1
+ subps xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+ mulps xmm1, [esp + two]
+ subps xmm7, xmm1
+ mulps xmm0, [esp + qqO]
+ mulps xmm7, [esp + qqO]
+
+ mulps xmm4, xmm7 /* total fsO in xmm4 */
+
+ addps xmm0, [esp + vctot]
+ movaps [esp + vctot], xmm0
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H1 interactions */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm6=rinv, xmm4=rinvsq */
+ movaps xmm7, xmm6
+ movaps xmm0, [esp + krsqH1]
+ addps xmm6, xmm0 /* xmm6=rinv+krsq */
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+ mulps xmm0, [esp + two]
+ subps xmm7, xmm0 /* xmm7=rinv-2*krsq */
+ mulps xmm6, [esp + qqH] /* vcoul */
+ mulps xmm7, [esp + qqH]
+ mulps xmm4, xmm7 /* total fsH1 in xmm4 */
+
+ addps xmm6, [esp + vctot]
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ movaps [esp + vctot], xmm6
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* H2 interactions */
+ movaps xmm4, xmm5
+ mulps xmm4, xmm4 /* xmm5=rinv, xmm4=rinvsq */
+ movaps xmm7, xmm5
+ movaps xmm0, [esp + krsqH2]
+ addps xmm5, xmm0 /* xmm5=rinv+krsq */
+ subps xmm5, [esp + crf] /* xmm5=rinv+krsq-crf */
+ mulps xmm0, [esp + two]
+ subps xmm7, xmm0 /* xmm7=rinv-2*krsq */
+ mulps xmm5, [esp + qqH] /* vcoul */
+ mulps xmm7, [esp + qqH]
+ mulps xmm4, xmm7 /* total fsH2 in xmm4 */
+
+ addps xmm5, [esp + vctot]
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i2020_odd_inner
+ jmp .i2020_unroll_loop
+.i2020_odd_inner:
+ add [esp + innerk], 4
+ jnz .i2020_odd_loop
+ jmp .i2020_updateouterdata
+.i2020_odd_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH]
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ movaps xmm0, xmm4
+ mulps xmm0, [esp + krf]
+ movaps [esp + krsqO], xmm0
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm4 /* xmm4=rinvsq */
+
+ movaps xmm1, xmm0 /* xmm1=rinv */
+ movaps xmm3, [esp + krsqO]
+ addps xmm0, xmm3 /* xmm0=rinv+krsq */
+ subps xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+ mulps xmm3, [esp + two]
+ subps xmm1, xmm3 /* xmm1=rinv-2*krsq */
+ mulps xmm0, [esp + qqO] /* xmm0=vcoul */
+ mulps xmm1, [esp + qqO] /* xmm1=coul part of fs */
+
+
+ mulps xmm4, xmm1 /* xmm4=total fscal */
+ addps xmm0, [esp + vctot]
+ movaps [esp + vctot], xmm0
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* shift right */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* shift right */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4]
+ movss xmm7, [edi + eax*4 + 8]
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5
+ unpckhps xmm0, xmm1
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk]
+ jz .i2020_updateouterdata
+ jmp .i2020_odd_loop
+.i2020_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid]
+ mov edx, [edx]
+ add [ebp + gid], 4
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2020_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2020_outer
+.i2020_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 724
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl2030_sse
+ .type inl2030_sse,@function
+inl2030_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ krf, 60
+.equ crf, 64
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ vctot, 768
+.equ fixO, 784
+.equ fiyO, 800
+.equ fizO, 816
+.equ fixH1, 832
+.equ fiyH1, 848
+.equ fizH1, 864
+.equ fixH2, 880
+.equ fiyH2, 896
+.equ fizH2, 912
+.equ fjxO, 928
+.equ fjyO, 944
+.equ fjzO, 960
+.equ fjxH1, 976
+.equ fjyH1, 992
+.equ fjzH1, 1008
+.equ fjxH2, 1024
+.equ fjyH2, 1040
+.equ fjzH2, 1056
+.equ half, 1072
+.equ three, 1088
+.equ rsqOO, 1104
+.equ rsqOH1, 1120
+.equ rsqOH2, 1136
+.equ rsqH1O, 1152
+.equ rsqH1H1, 1168
+.equ rsqH1H2, 1184
+.equ rsqH2O, 1200
+.equ rsqH2H1, 1216
+.equ rsqH2H2, 1232
+.equ rinvOO, 1248
+.equ rinvOH1, 1264
+.equ rinvOH2, 1280
+.equ rinvH1O, 1296
+.equ rinvH1H1, 1312
+.equ rinvH1H2, 1328
+.equ rinvH2O, 1344
+.equ rinvH2H1, 1360
+.equ rinvH2H2, 1376
+.equ two, 1392
+.equ krf, 1408
+.equ crf, 1424
+.equ is3, 1440
+.equ ii3, 1444
+.equ innerjjnr, 1448
+.equ innerk, 1452
+.equ salign, 1456
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1460 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_three]
+ movups xmm4, [sse_two]
+ movss xmm5, [ebp + krf]
+ movss xmm6, [ebp + crf]
+
+ movaps [esp + half], xmm0
+ movaps [esp + three], xmm1
+ movaps [esp + two], xmm4
+ shufps xmm5, xmm5, 0
+ shufps xmm6, xmm6, 0
+ movaps [esp + krf], xmm5
+ movaps [esp + crf], xmm6
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4]
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3
+ mulss xmm4, xmm5
+ mulss xmm5, xmm5
+ mulss xmm3, xmm6
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+.i2030_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4 (loop is unrolled by 4) */
+ jge .i2030_unroll_loop
+ jmp .i2030_single_check
+.i2030_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt use rsq values in xmm0, xmm4 */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO]
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O]
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half]
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction */
+ movaps xmm0, [esp + rinvOO]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+
+ mulps xmm5, [esp + rsqOO] /* xmm5=krsq */
+ movaps xmm6, xmm5
+ addps xmm6, xmm7 /* xmm6=rinv+krsq */
+ subps xmm6, [esp + crf]
+ mulps xmm6, [esp + qqOO] /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOO] /* xmm7 = coul part of fscal */
+
+ addps xmm6, [esp + vctot] /* local vctot summation variable */
+ mulps xmm0, xmm7
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqOH1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0
+ mulps xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH1 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0
+ mulps xmm5, [esp + rsqOH2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=r inv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0
+ mulps xmm4, [esp + qqOH] /* xmm4=voul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* fsOH2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction (i H1 against the j O atoms) */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH1O] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H1-O */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm3, [esp + fjxO] /* j O force accumulated so far */
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0 /* subtract from the j O force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1] /* add to the i H1 force */
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3 /* write both accumulators back */
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction (i H1 against the j H1 atoms) */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH1H1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H1-H1 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm3, [esp + fjxH1] /* j H1 force accumulated so far */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0 /* subtract from the j H1 force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1] /* add to the i H1 force */
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3 /* write both accumulators back */
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction (i H1 against the j H2 atoms) */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH1H2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H1-H2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm3, [esp + fjxH2] /* j H2 force accumulated so far */
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0 /* subtract from the j H2 force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1] /* add to the i H1 force */
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3 /* write both accumulators back */
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction (i H2 against the j O atoms) */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH2O] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H2-O */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm3, [esp + fjxO] /* j O force accumulated so far */
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0 /* subtract from the j O force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2] /* add to the i H2 force */
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3 /* write both accumulators back */
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction (i H2 against the j H1 atoms) */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH2H1] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H2-H1 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm3, [esp + fjxH1] /* j H1 force accumulated so far */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0 /* subtract from the j H1 force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2] /* add to the i H2 force */
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3 /* write both accumulators back */
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction (i H2 against the j H2 atoms) */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm7, xmm0 /* xmm7=rinv */
+ movaps xmm5, [esp + krf]
+ movaps xmm1, xmm0 /* overwritten below before it is read */
+ mulps xmm5, [esp + rsqH2H2] /* xmm5=krsq */
+ movaps xmm4, xmm5
+ addps xmm4, xmm7 /* xmm4=rinv+krsq */
+ subps xmm4, [esp + crf]
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ mulps xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+ mulps xmm5, [esp + two]
+ subps xmm7, xmm5 /* xmm7=rinv-2*krsq */
+ mulps xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+ addps xmm6, xmm4 /* add to local vctot */
+ mulps xmm0, xmm7 /* xmm0=fscal for H2-H2 */
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0 /* fscal copied to xmm1/xmm2 for the y/z components */
+
+ movaps xmm1, xmm0 /* repeats the copy already made just above */
+ movaps [esp + vctot], xmm6 /* store the local vctot sum back to the stack */
+ movaps xmm2, xmm0 /* repeats the copy already made just above */
+
+ movaps xmm3, [esp + fjxH2] /* j H2 force accumulated so far */
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2] /* xmm0-xmm2 = fscal*(dx,dy,dz) */
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0 /* subtract from the j H2 force */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2] /* add to the i H2 force */
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3 /* write both accumulators back */
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4]
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32]
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10
+ shufps xmm7, xmm7, 0b11
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i2030_single_check
+ jmp .i2030_unroll_loop
+.i2030_single_check: /* reached when fewer than 4 j waters are left for the unrolled loop */
+ add [esp + innerk], 4 /* undo the subtraction of 4: innerk = leftover count. NOTE(review): no explicit operand size here, unlike the 'dec dword ptr' in the single loop - confirm the assembler uses 32 bits */
+ jnz .i2030_single_loop /* leftovers remain: process them one per iteration */
+ jmp .i2030_updateouterdata /* nothing left for this i water */
+.i2030_single_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ movaps xmm6, xmm0
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ mulps xmm6, [esp + krf] /* xmm6=krsq */
+ movaps xmm2, xmm1
+ movaps xmm7, xmm6 /* xmm7=krsq */
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+
+
+ addps xmm6, xmm3 /* xmm6=rinv+krsq */
+ mulps xmm7, [esp + two]
+ subps xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+ xorps xmm1, xmm1
+ movaps xmm0, xmm3
+ subps xmm3, xmm7 /* xmm3=rinv-2*krsq */
+ xorps xmm4, xmm4
+ mulps xmm0, xmm0 /* xmm0=rinvsq */
+ /* fetch charges to xmm4 (temporary) */
+ movss xmm4, [esp + qqOO]
+ movhps xmm4, [esp + qqOH]
+
+ mulps xmm6, xmm4 /* vcoul */
+ mulps xmm3, xmm4 /* coul part of fs */
+
+
+ addps xmm6, [esp + vctot]
+ mulps xmm0, xmm3 /* total fscal */
+ movaps [esp + vctot], xmm6
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O. Now do i H1 & H2 simultaneously; first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ mulps xmm0, [esp + krf] /* krsq */
+ mulps xmm4, [esp + krf] /* krsq */
+
+ /* assemble charges in xmm6 */
+ xorps xmm6, xmm6
+ movss xmm6, [esp + qqOH]
+ movhps xmm6, [esp + qqHH]
+ movaps xmm1, xmm0
+ movaps xmm5, xmm4
+ addps xmm0, xmm3 /* krsq+rinv */
+ addps xmm4, xmm7 /* krsq+rinv */
+ subps xmm0, [esp + crf]
+ subps xmm4, [esp + crf]
+ mulps xmm1, [esp + two]
+ mulps xmm5, [esp + two]
+ mulps xmm0, xmm6 /* vcoul */
+ mulps xmm4, xmm6 /* vcoul */
+ addps xmm4, xmm0
+ addps xmm4, [esp + vctot]
+ movaps [esp + vctot], xmm4
+ movaps xmm0, xmm3
+ movaps xmm4, xmm7
+ mulps xmm3, xmm3
+ mulps xmm7, xmm7
+ subps xmm0, xmm1
+ subps xmm4, xmm5
+ mulps xmm0, xmm6
+ mulps xmm4, xmm6
+ mulps xmm0, xmm3 /* fscal */
+ mulps xmm7, xmm4 /* fscal */
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do forces H2 - j water */
+ movaps xmm0, xmm7
+ movaps xmm1, xmm7
+ movaps xmm2, xmm7
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10
+ shufps xmm7, xmm7, 0b11
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4
+ unpckhps xmm5, xmm4
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz .i2030_updateouterdata
+ jmp .i2030_single_loop
+.i2030_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i2030_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i2030_outer
+.i2030_end: /* function epilogue */
+ emms /* clear the MMX state before returning */
+ mov eax, [esp + salign] /* NOTE(review): presumably the alignment padding saved by the prologue - confirm */
+ add esp, eax /* add that value back to the stack pointer */
+ add esp, 1460 /* release the local variable area; NOTE(review): must equal the amount reserved in the prologue - confirm */
+ pop edi /* restore registers; NOTE(review): assumed to mirror the prologue pushes in reverse - confirm */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave /* restore esp/ebp of the caller's frame */
+ ret
+
+
+
+
+.globl inl3000_sse
+ .type inl3000_sse,@function
+inl3000_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+ /* bottom of stack is aligned to 16 bytes for sse (movaps) use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ tsc, 128
+.equ qq, 144
+.equ fs, 160
+.equ vctot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240
+.equ three, 256
+.equ is3, 272
+.equ ii3, 276
+.equ innerjjnr, 280
+.equ innerk, 284
+.equ salign, 288
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 292 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i3000_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3000_unroll_loop
+ jmp .i3000_finish_inner
+.i3000_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ mulps xmm3, xmm2
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx and fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3000_finish_inner
+ jmp .i3000_unroll_loop
+.i3000_finish_inner: /* fewer than 4 j particles are left for the unrolled loop */
+ /* check if at least two particles remain */
+ add [esp + innerk], 4 /* undo the subtraction of 4: innerk = number of leftover j particles */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 is set when 2 or 3 are left (assuming the count is 0-3 here) */
+ jnz .i3000_dopair /* handle two of them as a pair */
+ jmp .i3000_checksingle /* otherwise continue at checksingle */
+.i3000_dopair:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7
+ movaps [esp + qq], xmm3
+
+ mov edi, [ebp + pos]
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3000_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i3000_dosingle
+ jmp .i3000_updateouterdata
+.i3000_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3000_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3000_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3000_outer
+.i3000_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 292
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3010_sse /* table-lookup Coulomb loop (SSE, single precision); each outer entry covers nscoul consecutive i atoms */
+ .type inl3010_sse,@function
+inl3010_sse:
+.equ nri, 8 /* arguments are addressed as [ebp + name]; nri = number of outer iterations */
+.equ iinr, 12 /* pointer into the i-atom index list (advanced 4 bytes per outer iteration) */
+.equ jindex, 16 /* pointer into the j-list offsets; entries n and n+1 bound the inner loop */
+.equ jjnr, 20 /* base of the j-atom index list */
+.equ shift, 24 /* pointer into the shift-index list */
+.equ shiftvec, 28 /* base of the shift vectors (3 floats per shift index) */
+.equ fshift, 32 /* base of the shift-force accumulators (3 floats per shift index) */
+.equ gid, 36 /* pointer into the energy-group index list */
+.equ pos, 40 /* base of the coordinates (3 floats per atom) */
+.equ faction, 44 /* base of the forces (3 floats per atom) */
+.equ charge, 48 /* base of the per-atom charges */
+.equ facel, 52 /* scalar multiplied into each i charge */
+.equ Vc, 56 /* base of the per-group Coulomb energy accumulators */
+.equ tabscale, 60 /* scalar that converts r into a table position */
+.equ VFtab, 64 /* base of the lookup table (4 floats per table point) */
+.equ nsatoms, 68 /* pointer to 3 ints per outer entry; only the third is read (as nscoul) */
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ tsc, 128
+.equ qq, 144
+.equ fscal, 160 /* reserved slot; not referenced in this routine */
+.equ vctot, 176
+.equ fix, 192
+.equ fiy, 208
+.equ fiz, 224
+.equ half, 240
+.equ three, 256
+.equ is3, 272 /* scalar (4-byte) locals start here */
+.equ ii3, 276
+.equ shX, 280
+.equ shY, 284
+.equ shZ, 288
+.equ ntia, 292 /* not referenced in this routine */
+.equ innerjjnr0, 296 /* start-of-list pointer, reused for every i atom of the entry */
+.equ innerk0, 300 /* j-atom count, reused for every i atom of the entry */
+.equ innerjjnr, 304
+.equ innerk, 308
+.equ salign, 312 /* bytes subtracted from esp for alignment; undone on exit */
+.equ nscoul, 316 /* remaining i atoms to process for the current entry */
+.equ solnr, 320 /* index of the next i atom of the current entry */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 324 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = misalignment of esp relative to 16 bytes */
+ sub esp, eax /* esp is now 16-byte aligned, as the movaps accesses below require */
+ mov [esp + salign], eax
+
+ emms /* clear MMX state; mm0-mm3, mm6 and mm7 are used as scratch below */
+
+ movups xmm0, [sse_half] /* sse_half/sse_two/sse_three are defined outside this routine */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four elements */
+ movaps [esp + tsc], xmm3
+
+ add [ebp + nsatoms], 8 /* point at the third int of the first nsatoms triplet */
+
+ /* assume we have at least one i particle - start directly */
+.i3010_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+ movss [esp + shX], xmm0 /* keep the shift vector; it is re-added for every i atom */
+ movss [esp + shY], xmm1
+ movss [esp + shZ], xmm2
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ mov ecx, [eax] /* ecx = number of i atoms to process for this entry */
+ add [ebp + nsatoms], 12 /* advance to the same field of the next triplet */
+ mov [esp + nscoul], ecx
+
+ /* clear vctot */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ mov [esp + solnr], ebx /* first i atom of this entry */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3010_mno_coul
+ jmp .i3010_last_mno /* nscoul == 0: no i atoms, go straight to the energy update */
+.i3010_mno_coul: /* executed once per i atom of the current entry */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr] /* next pass handles the following atom */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel] /* iq = facel*charge[ii] */
+ shufps xmm3, xmm3, 0
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4] /* i coordinate = shift vector + pos[ii3..] */
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0 /* broadcast so four j atoms can be handled at once */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j list for this i atom */
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3010_unroll_coul_loop /* tests the flags of "sub edx, 4"; mov leaves them intact */
+ jmp .i3010_finish_coul_inner
+
+.i3010_unroll_coul_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0 /* xmm3 = q1,q1,q2,q2 */
+ shufps xmm4, xmm7, 0 /* xmm4 = q3,q3,q4,q4 */
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ mulps xmm3, xmm2
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4] /* x,y pairs of j1 and j3 in the low halves */
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4] /* x,y pairs of j2 and j4 in the high halves */
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0 /* xmm2 = z1,z1,z2,z2 */
+ shufps xmm6, xmm1, 0 /* xmm6 = z3,z3,z4,z4 */
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z1..z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x1..x4 (even elements) */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y1..y4 (odd elements) */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4 /* approximate 1/sqrt(rsq), refined by one Newton-Raphson step below */
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* r*tabscale = table position */
+
+ movhlps xmm5, xmm4 /* cvttps2pi converts only two floats, so split into halves */
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5 /* xmm6 = the four truncated positions as floats */
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4: four floats per table point */
+ pslld mm7, 2
+
+ movd mm0, eax /* park the j3 offsets in mm0-mm3 while eax-edx hold table offsets */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6 /* table offset for j1 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* table offset for j3 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* table offset for j2 */
+ movd edx, mm7 /* table offset for j4 */
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = first table entry of each point */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5 = second table entry of each point */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6 = third table entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7 = fourth table entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -(fijC*tabscale*rinv), the scalar applied to dr */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0 /* restore the j3 offsets */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm4 contain fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* bring element 1 (j2) to position 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* bring element 2 (j3) to position 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* bring element 3 (j4) to position 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3010_finish_coul_inner /* fewer than four j atoms left */
+ jmp .i3010_unroll_coul_loop
+.i3010_finish_coul_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4 /* undo the last subtraction: innerk = leftover count (0-3) */
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i3010_dopair_coul
+ jmp .i3010_checksingle_coul
+.i3010_dopair_coul: /* two leftover j atoms: only elements 0 and 1 carry data */
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7 /* zero elements 2-3 of qq so the unused lanes get a zero charge product */
+ movaps [esp + qq], xmm3
+
+ mov edi, [ebp + pos]
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7 /* repeats the zeroing above; xmm3 is reloaded from qq before its next use */
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* xmm2 = z1,z2,z1,z2 */
+
+ shufps xmm0, xmm0, 0b10001000 /* xmm0 = x1,x2,x1,x2 */
+ shufps xmm1, xmm1, 0b11011101 /* xmm1 = y1,y2,y1,y2 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7 /* xmm7 stays zero until the table shuffle below uses it */
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4: four floats per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6 /* table offset for the first j atom */
+ psrlq mm6, 32
+ movd edx, mm6 /* table offset for the second j atom */
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm7, 0b11011101 /* upper two elements come from the zeroed xmm7 */
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -(fijC*tabscale*rinv) */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap elements 0 and 1: second j atom's force to position 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3010_checksingle_coul:
+ mov edx, [esp + innerk]
+ and edx, 1 /* odd leftover count means one more j atom */
+ jnz .i3010_dosingle_coul
+ jmp .i3010_updateouterdata_coul
+.i3010_dosingle_coul: /* one leftover j atom: only element 0 carries data */
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq] /* elements 1-3 of xmm6 are zero, so qq is zero in the unused lanes */
+ movaps [esp + qq], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4: four floats per table point */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6 /* only the low index is used */
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second table entry to position 0 */
+ shufps xmm7, xmm7, 1 /* fourth table entry to position 0 */
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -(fijC*tabscale*rinv) */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3010_updateouterdata_coul: /* reduce the four-lane i force and write it out */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* upper halves into the low halves of xmm3-xmm5 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* element 1 to position 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul] /* one i atom of this entry is done */
+ jz .i3010_last_mno
+ jmp .i3010_mno_coul
+
+.i3010_last_mno: /* all i atoms of the entry done: flush the accumulated energy */
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* element 0 = sum of all four vctot lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3010_end /* jecxz tests ecx directly; the count reached zero */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* the remaining count is kept in the argument slot */
+ jmp .i3010_outer
+.i3010_end:
+ emms /* leave the MMX state cleared */
+ mov eax, [esp + salign]
+ add esp, eax /* undo the alignment adjustment */
+ add esp, 324 /* release the local stack space reserved on entry */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3020_sse
+ .type inl3020_sse,@function
+inl3020_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ rinvO, 352
+.equ rinvH1, 368
+.equ rinvH2, 384
+.equ rO, 400
+.equ rH1, 416
+.equ rH2, 432
+.equ tsc, 448
+.equ two, 464
+.equ vctot, 480
+.equ fixO, 496
+.equ fiyO, 512
+.equ fizO, 528
+.equ fixH1, 544
+.equ fiyH1, 560
+.equ fizH1, 576
+.equ fixH2, 592
+.equ fiyH2, 608
+.equ fizH2, 624
+.equ fjx, 640
+.equ fjy, 656
+.equ fjz, 672
+.equ half, 688
+.equ three, 704
+.equ is3, 720
+.equ ii3, 724
+.equ innerjjnr, 728
+.equ innerk, 732
+.equ salign, 736
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 740 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp +tabscale]
+
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3
+ movaps [esp + iqH], xmm4
+
+.i3020_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3020_unroll_loop
+ jmp .i3020_odd_inner
+.i3020_unroll_loop:
+ /* quad-unrolled inner loop: handles four j atoms (jnr1-4) per pass */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movd mm0, eax /* stash jnr1-4 in mm0-mm3 (rewritten with the j3 values below before being read back) */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ /* start with rsqO - seed to xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvO], xmm4 /* rinvO in xmm4 */
+ mulps xmm7, xmm4
+ movaps [esp + rO], xmm7
+
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH1], xmm4 /* rinvH1 in xmm4 */
+ mulps xmm6, xmm4
+ movaps [esp + rH1], xmm6
+
+ /* rsqH2 - seed to xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH2], xmm4 /* rinvH2 in xmm4 */
+ mulps xmm5, xmm4
+ movaps [esp + rH2], xmm5
+
+ /* do O interactions */
+ /* rO is still in xmm7 */
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4: each table point is read as four floats below */
+ pslld mm7, 2
+
+ movd mm0, eax /* save the j3 indices; eax-edx are reused as table offsets */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab] /* esi=base of VFtab, also used by the H1 and H2 lookups below */
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm0, [esp + tsc]
+ mulps xmm0, [esp + rinvO]
+ subps xmm4, xmm0
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* tx in xmm0-xmm2 */
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with O interactions - now H1! */
+ movaps xmm7, [esp + rH1]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm7 fijC */
+ /* increment vcoul */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH1]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with H1, finally we do H2 interactions */
+ movaps xmm7, [esp + rH2]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm7 fijC */
+ /* increment vcoul */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ movd eax, mm0 /* restore the j3 indices saved before the table lookups */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* bring lane 1 down to lane 0 */
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* bring lane 2 down to lane 0 */
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* bring lane 3 down to lane 0 */
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4 /* negative result: fewer than four j atoms remain */
+ jl .i3020_odd_inner
+ jmp .i3020_unroll_loop
+.i3020_odd_inner:
+ add [esp + innerk], 4 /* undo the -4 bias used by the unrolled loop: innerk = leftover j atoms */
+ jnz .i3020_odd_loop /* leftovers exist: handle them one j atom at a time */
+ jmp .i3020_updateouterdata /* none left: finish this i water */
+.i3020_odd_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4 /* advance pointer by one j atom */
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH] /* xmm4 lanes: iqO, 0, iqH, iqH */
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6 /* xmm3 lanes: ixO, 0, ixH1, ixH2 */
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6 /* xmm4 lanes: iyO, 0, iyH1, iyH2 */
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6 /* xmm5 lanes: izO, 0, izH1, izH2 */
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3 /* the O slots hold dr for all three i atoms here */
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ movaps [esp + rinvO], xmm0
+
+ mulps xmm4, [esp + tsc]
+ movhlps xmm7, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm7 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm7, mm7
+ movlhps xmm3, xmm7
+
+ subps xmm4, xmm3
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax /* save eax, ecx, edx; they are reused as table offsets */
+ movd mm1, ecx
+ movd mm2, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6 /* only lanes 0, 2 and 3 are looked up; lane 1 is not used */
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+ mulps xmm0, [esp + tsc]
+ mulps xmm0, [esp + rinvO]
+ subps xmm4, xmm0
+
+ movd eax, mm0 /* restore the registers saved before the table lookup */
+ movd ecx, mm1
+ movd edx, mm2
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* shift right: lane 2 (H1) moves to lane 0 */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* shift right: lane 3 (H2) moves to lane 0 */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - sum the O, H1 and H2 lanes of tx/ty into xmm0 and of tz into xmm2 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4]
+ movss xmm7, [edi + eax*4 + 8]
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5 /* zero the upper half, dropping the unused lane 1 */
+ unpckhps xmm0, xmm1
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk] /* one leftover j atom done */
+ jz .i3020_updateouterdata
+ jmp .i3020_odd_loop
+.i3020_updateouterdata:
+ mov ecx, [esp + ii3] /* ecx=ii3, index of the i water in faction[] */
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3] /* edx=is3, index into fshift[] */
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000 /* xmm6 lanes 0-1 = fx, fy; xmm7 lane 0 = fz */
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid]
+ mov edx, [edx] /* edx=value at the current gid[] position, used to index Vc[] below */
+ add [ebp + gid], 4 /* advance the gid pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3020_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back into the nri argument slot */
+ jmp .i3020_outer
+.i3020_end:
+ emms /* clear the MMX state used for the table index conversions */
+ mov eax, [esp + salign] /* presumably the alignment adjustment saved by the prologue - confirm there */
+ add esp, eax
+ add esp, 740 /* release local stack space; must match the prologue's sub esp (not visible here) */
+ pop edi /* restore saved registers */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3030_sse
+ .type inl3030_sse,@function
+inl3030_sse:
+.equ nri, 8 /* argument offsets relative to ebp */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ tabscale, 60
+.equ VFtab, 64
+ /* stack offsets for local variables */
+ /* esp is rounded down to a 16-byte boundary below so movaps can be used on these slots */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ two, 768
+.equ tsc, 784
+.equ vctot, 800
+.equ fixO, 816
+.equ fiyO, 832
+.equ fizO, 848
+.equ fixH1, 864
+.equ fiyH1, 880
+.equ fizH1, 896
+.equ fixH2, 912
+.equ fiyH2, 928
+.equ fizH2, 944
+.equ fjxO, 960
+.equ fjyO, 976
+.equ fjzO, 992
+.equ fjxH1, 1008
+.equ fjyH1, 1024
+.equ fjzH1, 1040
+.equ fjxH2, 1056
+.equ fjyH2, 1072
+.equ fjzH2, 1088
+.equ half, 1104
+.equ three, 1120
+.equ rsqOO, 1136
+.equ rsqOH1, 1152
+.equ rsqOH2, 1168
+.equ rsqH1O, 1184
+.equ rsqH1H1, 1200
+.equ rsqH1H2, 1216
+.equ rsqH2O, 1232
+.equ rsqH2H1, 1248
+.equ rsqH2H2, 1264
+.equ rinvOO, 1280
+.equ rinvOH1, 1296
+.equ rinvOH2, 1312
+.equ rinvH1O, 1328
+.equ rinvH1H1, 1344
+.equ rinvH1H2, 1360
+.equ rinvH2O, 1376
+.equ rinvH2H1, 1392
+.equ rinvH2H2, 1408
+.equ is3, 1424 /* 4-byte scalar slots from here on */
+.equ ii3, 1428
+.equ innerjjnr, 1432
+.equ innerk, 1436
+.equ salign, 1440
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1444 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = esp modulo 16 */
+ sub esp, eax /* esp is now 16-byte aligned */
+ mov [esp + salign], eax /* remember the adjustment */
+
+ emms
+
+ movups xmm0, [sse_half] /* sse_half, sse_two and sse_three are defined outside this section */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp +tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* charge[ii] */
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4] /* charge[ii+1] */
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3 /* charge[ii]*charge[ii] */
+ mulss xmm4, xmm5 /* charge[ii]*charge[ii+1] */
+ mulss xmm5, xmm5 /* charge[ii+1]*charge[ii+1] */
+ mulss xmm3, xmm6 /* scale all three products by facel */
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+.i3030_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4] /* xmm0-xmm2 = shift vector x, y, z */
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4] /* shifted O coordinates */
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0 /* broadcast to all four lanes */
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0 /* copy the shift again; only lane 0 matters, it is broadcast below */
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12] /* shifted H1 coordinates */
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24] /* shifted H2 coordinates */
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms minus 4 */
+ jge .i3030_unroll_loop /* flags still come from the sub above: at least four j atoms */
+ jmp .i3030_single_check
+.i3030_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt use rsq values in xmm0, xmm4 */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO]
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O]
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half]
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction: table lookup, add to vctot, initialise fjO and update fiO */
+ movaps xmm0, [esp + rinvOO] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOO] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the position in the table */
+
+ movhlps xmm2, xmm1 /* upper two lanes to the low half of xmm2 */
+ cvttps2pi mm6, xmm1 /* truncate lanes 0,1 to integers */
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6 /* convert the indices back to float ... */
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* ... and reassemble all four in xmm3 */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4; with the *4 scale below that is 16 bytes (4 floats) per table point */
+ pslld mm7, 2
+
+ movd mm0, eax /* park eax..edx in mm0..mm3: they are reused for table offsets */
+ movd mm1, ebx /* and are read back from mm0..mm3 after the H2-H2 interaction */
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab] /* esi = table base; the later interactions in this pass reuse it */
+ movd eax, mm6 /* eax = offset for lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx = offset for lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx = offset for lane 1 */
+ movd edx, mm7 /* edx = offset for lane 3 */
+
+ movlps xmm5, [esi + eax*4] /* floats 0,1 of the table point for lane 0 */
+ movlps xmm7, [esi + ecx*4] /* ... lane 2 */
+ movhps xmm5, [esi + ebx*4] /* ... lane 1 */
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = float 0 of each point, lanes 0..3 */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5 = float 1 of each point, lanes 0..3 */
+
+ movlps xmm7, [esi + eax*4 + 8] /* floats 2,3 of the table point for lane 0 */
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6 = float 2 of each point */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7 = float 3 of each point */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ xorps xmm2, xmm2 /* xmm2=0 */
+ movaps [esp + vctot], xmm5
+ mulps xmm3, [esp + tsc] /* fijC*tsc */
+
+ subps xmm2, xmm3 /* xmm2=-fijC*tsc */
+ mulps xmm0, xmm2 /* xmm0=-fijC*tsc*rinv, the factor applied to dx,dy,dz */
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3 /* xmm3-xmm5=0: the j oxygen force starts from zero here */
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO] /* xmm0-xmm2 = x,y,z force components */
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0 /* j side gets the negated components */
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO] /* i side accumulates them */
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1O] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2O] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2]
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4]
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32]
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10
+ shufps xmm7, xmm7, 0b11
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? (one unrolled pass handles 4 j waters) */
+ sub [esp + innerk], 4 /* innerk -= 4; sets the flags tested below */
+ jl .i3030_single_check /* signed result < 0: leave the unrolled loop */
+ jmp .i3030_unroll_loop /* otherwise run another unrolled pass */
+.i3030_single_check:
+ add [esp + innerk], 4 /* add the 4 back; zero flag set if nothing remains */
+ jnz .i3030_single_loop /* nonzero: continue with the one-water-at-a-time loop */
+ jmp .i3030_updateouterdata /* zero: skip the single loop */
+.i3030_single_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ add [esp + innerjjnr], 4
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ mov esi, [ebp + VFtab]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOO]
+ movhps xmm3, [esp + qqOH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm2, xmm2
+ mulps xmm3, [esp + tsc]
+
+ subps xmm2, xmm3
+ mulps xmm0, xmm2
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O. Now do i H1 & H2 simultaneously - first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rsqH2O], xmm4
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rinvH2O], xmm7
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do table for H2 - j water interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, [esp + rsqH2O]
+ mulps xmm1, xmm0 /* xmm0=rinv, xmm1=r */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10
+ shufps xmm7, xmm7, 0b11
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4
+ unpckhps xmm5, xmm4
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz .i3030_updateouterdata
+ jmp .i3030_single_loop
+.i3030_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3030_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3030_outer
+.i3030_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 1444
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+ /* inl3100_sse: SSE inner loop combining a tabulated Coulomb interaction */
+ /* (4 table floats per point, read from VFtab) with a Lennard-Jones */
+ /* interaction (c6/c12 pairs read from nbfp). For every i particle it loops */
+ /* over its j neighbours 4, then 2, then 1 at a time, updating faction, */
+ /* fshift, Vc and Vnb in memory. */
+.globl inl3100_sse
+ .type inl3100_sse,@function
+inl3100_sse:
+ /* offsets of the stack arguments relative to ebp; the first argument is at */
+ /* ebp+8, above the saved ebp and the return address */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables (relative to esp after the prologue) */
+ /* esp is rounded down to a 16-byte boundary below, so the 16-byte slots */
+ /* at offsets 0..336 can be accessed with movaps */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ six, 128
+.equ twelve, 144
+.equ tsc, 160
+.equ qq, 176
+.equ c6, 192
+.equ c12, 208
+.equ fscal, 224
+.equ vctot, 240
+.equ vnbtot, 256
+.equ fix, 272
+.equ fiy, 288
+.equ fiz, 304
+.equ half, 320
+.equ three, 336
+ /* 4-byte integer slots */
+.equ is3, 352
+.equ ii3, 356
+.equ ntia, 360
+.equ innerjjnr, 364
+.equ innerk, 368
+.equ salign, 372
+ /* prologue: set up the frame and save all general purpose registers used */
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 376 /* local stack space */
+ /* align esp down to 16 bytes and remember the adjustment in salign so */
+ /* that the epilogue can undo it */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the constants to aligned local slots; movups is used for the */
+ /* loads, so the sse_* symbols (defined elsewhere) need not be aligned */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movups xmm3, [sse_six]
+ movups xmm4, [sse_twelve]
+ movss xmm5, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ movaps [esp + six], xmm3
+ movaps [esp + twelve], xmm4
+ shufps xmm5, xmm5, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm5
+
+ /* assume we have at least one i particle - start directly */
+ /* (nri is only tested at the bottom of the outer loop) */
+ /* Outer loop head: runs once per i particle. Loads the shift vector, the */
+ /* i charge (scaled by facel), the i type offset into nbfp and the shifted */
+ /* i coordinates, clears the per-i accumulators and sets up the j list. */
+ /* Memory/immediate arithmetic carries an explicit dword ptr because the */
+ /* operand size cannot be deduced from a register operand. */
+.i3100_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ /* xmm0-xmm2 = x, y, z of the shift vector */
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ /* iq = facel*charge[ii], broadcast to all four lanes */
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ /* ntia = 2*ntype*type[ii] */
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ /* i coordinates = shift vector + pos[ii3..ii3+2] */
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ /* broadcast each coordinate to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ /* innerk = number of innerloop atoms - 4; the mov leaves the flags of */
+ /* the sub intact for the jge below */
+ mov [esp + innerk], edx
+ jge .i3100_unroll_loop
+ jmp .i3100_finish_inner
+ /* Inner loop, four j particles per iteration (one per SSE lane). */
+ /* Lane order throughout is j1..j4 = the particles indexed by eax, ebx, */
+ /* ecx, edx. Memory/immediate arithmetic carries an explicit dword ptr */
+ /* because the operand size cannot be deduced from a register operand. */
+.i3100_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0 /* xmm3 = q1 q1 q2 q2 */
+ shufps xmm4, xmm7, 0 /* xmm4 = q3 q3 q4 q4 */
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ mulps xmm3, xmm2 /* qq = iq*jq */
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qq], xmm3
+
+ /* index into nbfp for each j: 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* each 8-byte load fetches one (c6, c12) pair */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ /* transpose the pairs: xmm4 = four c6, xmm6 = four c12 */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ /* restore jnr1-4 from the mmx registers */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ /* x and y are loaded as pairs, z as single floats */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = jz1-4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = jx1-4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = jy1-4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ /* rinv = 0.5*lu*(3.0 - rsq*lu*lu), one refinement step on the */
+ /* rsqrtps estimate lu */
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+ /* split r*tabscale into a truncated integer index and the remainder eps */
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4: four table floats per point */
+ pslld mm7, 2
+
+ /* park j3 offsets in mm0-mm3 while eax-edx hold the table indices */
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ /* transpose: xmm4 = first table float, xmm5 = second (F) for each j */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ /* transpose: xmm6 = third table float (G), xmm7 = fourth (H) */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6] /* xmm6=c6*rinvsix */
+ mulps xmm4, [esp + c12] /* xmm4=c12*rinvtwelve */
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6 /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+ mulps xmm3, [esp + tsc] /* xmm3=fijC*tabscale */
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ /* xmm4 = ((12*c12*rinv12 - 6*c6*rinv6)*rinv - fijC*tabscale)*rinv */
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ /* restore the j3 offsets */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ /* each shufps moves the next lane of tz into lane 0 for the subss */
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl .i3100_finish_inner
+ jmp .i3100_unroll_loop
+ /* The unrolled loop is done (or was skipped): innerk is negative here, so */
+ /* adding 4 back gives the number of remaining j particles. Bit 1 of that */
+ /* count selects the pair code. The add carries an explicit dword ptr */
+ /* because the operand size cannot be deduced from a register operand. */
+.i3100_finish_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i3100_dopair
+ jmp .i3100_checksingle
+ /* Remainder code for two j particles (indexed by eax and ebx). Only lanes */
+ /* 0-1 carry data; qq, c6 and c12 are zeroed in lanes 2-3 so those lanes */
+ /* add nothing to the energies or forces. The add on innerjjnr carries an */
+ /* explicit dword ptr because the operand size cannot be deduced from a */
+ /* register operand. */
+.i3100_dopair:
+ mov esi, [ebp + charge]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7 /* zero lanes 2-3 of qq */
+ movaps [esp + qq], xmm3
+
+ /* index into nbfp for each j: 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+
+ /* split the two (c6, c12) pairs and zero lanes 2-3 */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ /* xmm3 was already stored to qq above; this only re-clears its upper half */
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ /* lanes 2-3 of xmm0-xmm2 duplicate lanes 0-1 */
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ /* rinv = 0.5*lu*(3.0 - rsq*lu*lu), one refinement step on the */
+ /* rsqrtps estimate lu */
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+ /* only lanes 0-1 are converted; eps in lanes 2-3 is not meaningful */
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4: four table floats per point */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ /* lanes 2-3 are taken from xmm7, which is still zero here */
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6] /* xmm6=c6*rinvsix */
+ mulps xmm4, [esp + c12] /* xmm4=c12*rinvtwelve */
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6 /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+ mulps xmm3, [esp + tsc] /* xmm3=fijC*tabscale */
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ /* xmm4 = ((12*c12*rinv12 - 6*c6*rinv6)*rinv - fijC*tabscale)*rinv */
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ /* first j particle: lane 0 of tx-tz */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap lanes 0 and 1 so the second particle's force is in lane 0 */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+ /* innerk holds the remainder count (the value finish_inner restored by */
+ /* adding 4 back); bit 0 set means one last j particle is left to process */
+.i3100_checksingle:
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i3100_dosingle
+ jmp .i3100_updateouterdata
+ /* Remainder code for a single j particle (indexed by eax). Only lane 0 */
+ /* carries data: qq, c6 and c12 are zero in lanes 1-3, and only lane 0 of */
+ /* the j force is written back with movss. */
+.i3100_dosingle:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ /* index into nbfp: 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ /* xmm6 lanes 2-3 are still zero from the qq product above; the shuffles */
+ /* put c6 / c12 in lane 0 and copy that zero (lane 3) into lanes 1-3 */
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ /* rinv = 0.5*lu*(3.0 - rsq*lu*lu), one refinement step on the */
+ /* rsqrtps estimate lu */
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4: four table floats per point */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6 /* only the lane 0 index is used */
+
+ /* the four table floats go to lane 0 of xmm4, xmm5, xmm6, xmm7 */
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6] /* xmm6=c6*rinvsix */
+ mulps xmm4, [esp + c12] /* xmm4=c12*rinvtwelve */
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6 /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+ mulps xmm3, [esp + tsc] /* xmm3=fijC*tabscale */
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ /* xmm4 = ((12*c12*rinv12 - 6*c6*rinv6)*rinv - fijC*tabscale)*rinv */
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+ /* End of one i particle: sum the four lanes of the i force and of both */
+ /* energy accumulators, add them to faction[ii3], fshift[is3], Vc[gid] and */
+ /* Vnb[gid], then loop while nri is not exhausted. The add on gid carries */
+ /* an explicit dword ptr because the operand size cannot be deduced from a */
+ /* register operand. */
+.i3100_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* add the upper half onto the lower half */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ /* bring lane 1 down to lane 0 and add it */
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3100_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3100_outer
+ /* epilogue: leave MMX state, undo the alignment adjustment saved in */
+ /* salign and the 376 bytes of locals, then restore the registers in the */
+ /* reverse order of the pushes in the prologue */
+.i3100_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 376
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3110_sse
+ .type inl3110_sse,@function
+inl3110_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+.equ nsatoms, 84
+ /* stack offsets for local variables */
+ /* bottom of stack is cache-aligned for sse use */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ tsc, 128
+.equ qq, 144
+.equ c6, 160
+.equ c12, 176
+.equ six, 192
+.equ twelve, 208
+.equ fscal, 224
+.equ vctot, 240
+.equ vnbtot, 256
+.equ fix, 272
+.equ fiy, 288
+.equ fiz, 304
+.equ half, 320
+.equ three, 336
+.equ is3, 352
+.equ ii3, 356
+.equ shX, 360
+.equ shY, 364
+.equ shZ, 368
+.equ ntia, 372
+.equ innerjjnr0, 376
+.equ innerk0, 380
+.equ innerjjnr, 384
+.equ innerk, 388
+.equ salign, 392
+.equ nsvdwc, 396
+.equ nscoul, 400
+.equ nsvdw, 404
+.equ solnr, 408
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 412 /* local stack space */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movups xmm3, [sse_six]
+ movups xmm4, [sse_twelve]
+ movss xmm5, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ movaps [esp + six], xmm3
+ movaps [esp + twelve], xmm4
+ shufps xmm5, xmm5, 0
+ movaps [esp + tsc], xmm5
+
+ /* assume we have at least one i particle - start directly */
+.i3110_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movlps xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 8]
+ movlps [esp + shX], xmm0
+ movss [esp + shZ], xmm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12
+ mov ecx, [eax]
+ mov edx, [eax + 4]
+ mov eax, [eax + 8]
+ sub ecx, eax
+ sub eax, edx
+
+ mov [esp + nsvdwc], edx
+ mov [esp + nscoul], eax
+ mov [esp + nsvdw], ecx
+
+ /* clear potential */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ mov [esp + solnr], ebx
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i3110_mno_vdwc
+ jmp .i3110_testcoul
+.i3110_mno_vdwc:
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3110_unroll_vdwc_loop
+ jmp .i3110_finish_vdwc_inner
+.i3110_unroll_vdwc_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ mulps xmm3, xmm2
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6]
+ mulps xmm4, [esp + c12]
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6
+ mulps xmm3, [esp + tsc]
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 and xmm4 contain fjx and fjy (the z forces are handled below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3110_finish_vdwc_inner
+ jmp .i3110_unroll_vdwc_loop
+.i3110_finish_vdwc_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* innerk += 4: back to the leftover count (0-3); operand size made explicit */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set: a pair of j atoms is left */
+ jnz .i3110_dopair_vdwc
+ jmp .i3110_checksingle_vdwc
+.i3110_dopair_vdwc:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6]
+ mulps xmm4, [esp + c12]
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6
+ mulps xmm3, [esp + tsc]
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ mov edi, [ebp + faction]
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3110_checksingle_vdwc: /* is there one last unpaired j atom? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 set: a single j atom is left */
+ jnz .i3110_dosingle_vdwc
+ jmp .i3110_updateouterdata_vdwc
+.i3110_dosingle_vdwc:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ movlps xmm6, [esi + ecx*4]
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* L-J */
+ movaps xmm4, xmm0
+ mulps xmm4, xmm0 /* xmm4=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+
+ movaps xmm6, xmm4
+ mulps xmm6, xmm4
+
+ movaps [esp + vctot], xmm5
+
+ mulps xmm6, xmm4 /* xmm6=rinvsix */
+ movaps xmm4, xmm6
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm6, [esp + c6]
+ mulps xmm4, [esp + c12]
+ movaps xmm7, [esp + vnbtot]
+ addps xmm7, xmm4
+ mulps xmm4, [esp + twelve]
+ subps xmm7, xmm6
+ mulps xmm3, [esp + tsc]
+ mulps xmm6, [esp + six]
+ movaps [esp + vnbtot], xmm7
+ subps xmm4, xmm6
+ mulps xmm4, xmm0
+ subps xmm4, xmm3
+ mulps xmm4, xmm0
+
+ mov edi, [ebp +faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3110_updateouterdata_vdwc: /* reduce the i force and add it to faction[] and fshift[] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* low half of xmm3-xmm5 = high half of xmm0-xmm2 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two partial sums */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring lane 1 down to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+
+ /* count down nsvdwc; repeat for the next i atom while it is nonzero */
+ dec dword ptr [esp + nsvdwc]
+ jz .i3110_testcoul
+ jmp .i3110_mno_vdwc
+.i3110_testcoul: /* skip the Coulomb-only pass when nscoul is zero */
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3110_mno_coul
+ jmp .i3110_testvdw
+.i3110_mno_coul: /* set up one i atom for the Coulomb-only inner loop */
+ mov ebx, [esp + solnr] /* ebx = ii, index of the current i atom */
+ inc dword ptr [esp + solnr] /* advance solnr for the next pass */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* xmm3(0) = charge[ii] */
+ mulss xmm3, [ebp + facel] /* iq = facel*charge[ii] */
+ shufps xmm3, xmm3, 0 /* broadcast iq to all four lanes */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4] /* xmm0(0) = shX + pos[ii3] */
+ addss xmm1, [eax + ebx*4 + 4] /* xmm1(0) = shY + pos[ii3+1] */
+ addss xmm2, [eax + ebx*4 + 8] /* xmm2(0) = shZ + pos[ii3+2] */
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0 /* broadcast each coordinate to all four lanes */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j list from its saved beginning */
+ mov edx, [esp + innerk0]
+ sub edx, 4 /* sets the flags tested by jge below */
+ mov [esp + innerk], edx /* innerk = innerk0 - 4 (mov leaves the flags untouched) */
+ jge .i3110_unroll_coul_loop /* at least four j atoms: run the unrolled loop */
+ jmp .i3110_finish_coul_inner
+
+.i3110_unroll_coul_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ mulps xmm3, xmm2
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 and xmm4 contain fjx and fjy (the z forces are handled below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3110_finish_coul_inner
+ jmp .i3110_unroll_coul_loop
+.i3110_finish_coul_inner:
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* innerk += 4: back to the leftover count (0-3); operand size made explicit */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set: a pair of j atoms is left */
+ jnz .i3110_dopair_coul
+ jmp .i3110_checksingle_coul
+.i3110_dopair_coul:
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7
+ movaps [esp + qq], xmm3
+
+ mov edi, [ebp + pos]
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3110_checksingle_coul: /* is there one last unpaired j atom? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 set: a single j atom is left */
+ jnz .i3110_dosingle_coul
+ jmp .i3110_updateouterdata_coul
+.i3110_dosingle_coul:
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3110_updateouterdata_coul: /* reduce the i force and add it to faction[] and fshift[] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* low half of xmm3-xmm5 = high half of xmm0-xmm2 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two partial sums */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring lane 1 down to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* count down nscoul; repeat for the next i atom while it is nonzero */
+ dec dword ptr [esp + nscoul]
+ jz .i3110_testvdw
+ jmp .i3110_mno_coul
+.i3110_testvdw: /* skip the VdW-only pass when nsvdw is zero */
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i3110_mno_vdw
+ jmp .i3110_last_mno
+.i3110_mno_vdw: /* set up one i atom for the VdW-only inner loop */
+ mov ebx, [esp + solnr] /* ebx = ii, index of the current i atom */
+ inc dword ptr [esp + solnr] /* advance solnr for the next pass */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4] /* edx = type[ii] */
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4] /* xmm0(0) = shX + pos[ii3] */
+ addss xmm1, [eax + ebx*4 + 4] /* xmm1(0) = shY + pos[ii3+1] */
+ addss xmm2, [eax + ebx*4 + 8] /* xmm2(0) = shZ + pos[ii3+2] */
+
+ xorps xmm4, xmm4 /* clear the i force accumulators */
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ shufps xmm0, xmm0, 0 /* broadcast each coordinate to all four lanes */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j list from its saved beginning */
+ mov edx, [esp + innerk0]
+ sub edx, 4 /* sets the flags tested by jge below */
+ mov [esp + innerk], edx /* innerk = innerk0 - 4 (mov leaves the flags untouched) */
+ jge .i3110_unroll_vdw_loop /* at least four j atoms: run the unrolled loop */
+ jmp .i3110_finish_vdw_inner
+.i3110_unroll_vdw_loop:
+ /* quad-unroll innerloop here */
+ /* Lennard-Jones only: four j particles per pass, one per SSE element */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ /* nbfp index for each j = 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ /* each 8-byte load fetches two consecutive nbfp floats for one j */
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ /* transpose: first float of each pair to xmm4, second to xmm6 */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ /* restore jnr1-4 from the mmx registers */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ /* one Newton-Raphson step: rinvsq = lu*(2-rsq*lu) */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal = (12*vnb12-6*vnb6)*rinvsq */
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx/fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ /* each shufps below moves the next element of xmm2 into position 0 for the scalar subtract */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3110_finish_vdw_inner
+ jmp .i3110_unroll_vdw_loop
+.i3110_finish_vdw_inner:
+ /* check if at least two particles remain */
+ /* innerk went negative in the unrolled loop; adding 4 back gives the leftover count, and bit 1 selects the pair code */
+ add [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i3110_dopair_vdw
+ jmp .i3110_checksingle_vdw
+.i3110_dopair_vdw:
+ /* Lennard-Jones for two leftover j particles, held in elements 0 and 1 */
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8
+ xorps xmm7, xmm7
+
+ /* nbfp index for each j = 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+
+ /* c6 values to elements 0-1 of xmm4, c12 values to elements 0-1 of xmm6; movlhps with the zeroed xmm7 clears the upper halves */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000
+ shufps xmm6, xmm6, 0b1101
+ movlhps xmm4, xmm7
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7 /* NOTE(review): xmm3 is overwritten below before it is read, so this looks unnecessary */
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000
+
+ shufps xmm0, xmm0, 0b10001000
+ shufps xmm1, xmm1, 0b11011101
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ /* one Newton-Raphson step: rinvsq = lu*(2-rsq*lu) */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ /* c6/c12 are zero in elements 2-3, so those elements add nothing to vnbtot or the forces */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ /* first j particle: element 0 of tx-tz */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ /* swap elements 0 and 1 so the second j particle's force is in position 0 */
+ shufps xmm0, xmm0, 0b11100001
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3110_checksingle_vdw:
+ /* bit 0 of the leftover count says whether one last j particle remains */
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i3110_dosingle_vdw
+ jmp .i3110_updateouterdata_vdw
+.i3110_dosingle_vdw:
+ /* Lennard-Jones for a single leftover j particle, held in element 0 */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+
+ /* nbfp index = 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia]
+ movlps xmm6, [esi + ecx*4]
+ /* keep c6 (xmm4) and c12 (xmm6) in element 0 only; elements 1-3 take the zero from element 3 */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm6, xmm6, 0b11111101
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rcpps xmm5, xmm4
+ /* 1/x lookup seed in xmm5 */
+ /* one Newton-Raphson step: rinvsq = lu*(2-rsq*lu) */
+ movaps xmm0, [esp + two]
+ mulps xmm4, xmm5
+ subps xmm0, xmm4
+ mulps xmm0, xmm5 /* xmm0=rinvsq */
+ movaps xmm4, xmm0
+
+ movaps xmm1, xmm0
+ mulps xmm1, xmm0
+ mulps xmm1, xmm0 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm5, xmm2
+ subps xmm5, xmm1 /* vnb=vnb12-vnb6 */
+ addps xmm5, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ subps xmm2, xmm1
+ mulps xmm4, xmm2 /* xmm4=total fscal */
+
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movaps [esp + vnbtot], xmm5
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ mov edi, [ebp +faction] /* redundant: edi already holds faction from the load above */
+
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3110_updateouterdata_vdw:
+ /* reduce the four-wide i force accumulators to scalars and add them to faction[ii3] and fshift[is3] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ /* fold the upper two elements onto the lower two */
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in elements 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ /* bring element 1 down to position 0 and add it */
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ /* nsvdw counts the remaining VdW-only i particles */
+ dec dword ptr [esp + nsvdw]
+ jz .i3110_last_mno
+ jmp .i3110_mno_vdw
+.i3110_last_mno:
+ /* add the summed vctot and vnbtot to Vc[] and Vnb[] at this entry's group index, then loop while nri is not exhausted */
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ /* the nri argument slot on the stack doubles as the outer loop counter */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3110_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3110_outer
+.i3110_end:
+ /* epilogue: leave MMX state, release the local stack area and restore the saved registers */
+ emms
+ mov eax, [esp + salign]
+ add esp, eax /* undo the alignment adjustment saved in salign */
+ add esp, 412 /* presumably matches the local stack size reserved in the prologue (not visible here) - confirm */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3120_sse
+ .type inl3120_sse,@function
+inl3120_sse:
+ /* Entry of inl3120_sse. The visible code handles a three-site i molecule (O, H1, H2): */
+ /* tabulated Coulomb for all three sites, plus non-tabulated Lennard-Jones on the O site. */
+ /* argument offsets relative to ebp (first argument at ebp+8, above the saved ebp and return address) */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use (see the and/sub on esp below) */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ rinvO, 352
+.equ rinvH1, 368
+.equ rinvH2, 384
+.equ rO, 400
+.equ rH1, 416
+.equ rH2, 432
+.equ tsc, 448
+.equ two, 464
+.equ c6, 480
+.equ c12, 496
+.equ six, 512
+.equ twelve, 528
+.equ vctot, 544
+.equ vnbtot, 560
+.equ fixO, 576
+.equ fiyO, 592
+.equ fizO, 608
+.equ fixH1, 624
+.equ fiyH1, 640
+.equ fizH1, 656
+.equ fixH2, 672
+.equ fiyH2, 688
+.equ fizH2, 704
+.equ fjx, 720
+.equ fjy, 736
+.equ fjz, 752
+.equ half, 768
+.equ three, 784
+ /* the remaining locals are 4-byte scalars */
+.equ is3, 800
+.equ ii3, 804
+.equ ntia, 808
+.equ innerjjnr, 812
+.equ innerk, 816
+.equ salign, 820
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 824 /* local stack space */
+ /* round esp down to a multiple of 16 and remember the adjustment in salign */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy constants to aligned stack slots; sse_half etc. are symbols defined outside this chunk */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movups xmm3, [sse_six]
+ movups xmm4, [sse_twelve]
+ movss xmm5, [ebp +tabscale]
+
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ movaps [esp + six], xmm3
+ movaps [esp + twelve], xmm4
+ shufps xmm5, xmm5, 0
+ movaps [esp + tsc], xmm5 /* tabscale broadcast to all four elements */
+
+ /* assume we have at least one i particle - start directly */
+ /* iqO, iqH and ntia are computed once here, from the first i entry, and reused for every outer iteration */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ movss xmm4, [edx + ebx*4 + 4]
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3 /* facel*charge[ii] */
+ movaps [esp + iqH], xmm4 /* facel*charge[ii+1] */
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ mov [esp + ntia], ecx
+.i3120_outer:
+ /* per-i setup: shifted O/H1/H2 positions, cleared accumulators, and the neighbour-list range for this entry */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ /* O site: shift + pos[ii3..ii3+2], broadcast to all four elements */
+ addss xmm3, [eax + ebx*4]
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ /* H1 uses the next three floats of pos[] (byte offsets 12-20), H2 the three after that (24-32) */
+ movss xmm3, xmm0
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12]
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24]
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4; mov leaves the flags from sub intact */
+ jge .i3120_unroll_loop
+ jmp .i3120_odd_inner
+.i3120_unroll_loop:
+ /* quad-unroll innerloop here */
+ /* four j particles per pass, each interacting with the O, H1 and H2 sites of the i molecule */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ /* nbfp index for each j = 2*type[jnr] + ntia */
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ /* transpose: first float of each pair to xmm4 (c6), second to xmm6 (c12) */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ /* restore jnr1-4 from the mmx registers */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ /* each 1/sqrt below is the rsqrtps seed refined by one Newton-Raphson step: rinv = 0.5*lu*(3-rsq*lu*lu) */
+ /* start with rsqO - seed to xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvO], xmm4 /* rinvO in xmm4 */
+ mulps xmm7, xmm4
+ movaps [esp + rO], xmm7 /* rO = rsqO*rinvO */
+
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH1], xmm4 /* rinvH1 in xmm4 */
+ mulps xmm6, xmm4
+ movaps [esp + rH1], xmm6
+
+ /* rsqH2 - seed to xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 3-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH2], xmm4 /* rinvH2 in xmm4 */
+ mulps xmm5, xmm4
+ movaps [esp + rH2], xmm5
+
+ /* do O interactions */
+ /* rO is still in xmm7 */
+ /* table position = r*tabscale; the truncated integer part is the index, the remainder is eps */
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4: each table point is addressed as four floats */
+ pslld mm7, 2
+
+ /* save the j3 offsets in mmx registers; eax-edx hold table offsets until they are restored after the H2 part */
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+
+ /* do nontable L-J */
+ movaps xmm2, [esp + rinvO]
+ mulps xmm2, xmm2 /* xmm2=rinvsq */
+
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ movaps xmm1, xmm2
+ mulps xmm1, xmm1
+ mulps xmm1, xmm2 /* xmm1=rinvsix */
+ movaps xmm4, xmm1
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm4, [esp + c12]
+ movaps xmm3, xmm4
+ subps xmm3, xmm1 /* xmm3=vnb12-vnb6 */
+ mulps xmm1, [esp + six]
+ mulps xmm4, [esp + twelve]
+ subps xmm4, xmm1
+ addps xmm3, [esp + vnbtot]
+ /* fscal = ((12*vnb12-6*vnb6)*rinvO - fijC*tabscale)*rinvO */
+ mulps xmm4, [esp + rinvO]
+ mulps xmm0, [esp + tsc]
+ subps xmm4, xmm0
+ movaps [esp + vnbtot], xmm3
+ mulps xmm4, [esp + rinvO]
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* tx in xmm0-xmm2 */
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* start the j force accumulators fjx-fjz with the water O contribution */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with O interactions - now H1! */
+ movaps xmm7, [esp + rH1]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ /* esi still holds VFtab from the O part */
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm7 fijC */
+ /* increment vcoul */
+ /* fscal = 0 - fijC*rinvH1*tabscale, built in the zeroed xmm4 */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH1]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with H1, finally we do H2 interactions */
+ movaps xmm7, [esp + rH2]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm7 fijC */
+ /* increment vcoul */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* restore the j3 offsets saved before the table lookups */
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ /* xmm0-xmm2 become the summed O+H1+H2 force on each j particle */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ /* each shufps below moves the next element of xmm2 into position 0 for the scalar subtract */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3120_odd_inner
+ jmp .i3120_unroll_loop
+.i3120_odd_inner:
+ /* innerk is negative here; adding 4 back gives the leftover j count, and a nonzero result enters the odd loop */
+ add [esp + innerk], 4
+ jnz .i3120_odd_loop
+ jmp .i3120_updateouterdata
+.i3120_odd_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx] /* eax = jnr of the single j atom handled this pass */
+ add [esp + innerjjnr], 4 /* advance pointer one entry */
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO]
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH] /* xmm4 = iqO, 0, and two floats read from iqH */
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0 /* broadcast j charge to all four elements */
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ xorps xmm6, xmm6
+ mov esi, [ebp + type]
+ mov ebx, [esi + eax*4]
+ mov esi, [ebp + nbfp]
+ shl ebx, 1
+ add ebx, [esp + ntia]
+ movlps xmm6, [esi + ebx*4] /* xmm6 = c6 c12 0 0 */
+ movaps xmm7, xmm6
+ shufps xmm6, xmm6, 0b11111100 /* xmm6 = c6 0 0 0 */
+ shufps xmm7, xmm7, 0b11111101 /* xmm7 = c12 0 0 0 */
+ movaps [esp + c6], xmm6 /* only element 0 (the O slot) gets L-J parameters */
+ movaps [esp + c12], xmm7
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6 /* xmm3 = ixO 0 ixH1 ixH2 */
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6 /* xmm4 = iyO 0 iyH1 iyH2 */
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6 /* xmm5 = izO 0 izH1 izH2 */
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ movaps [esp + dxO], xmm3
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ movaps [esp + rinvO], xmm0
+
+ mulps xmm4, [esp + tsc] /* xmm4 = r*tsc */
+ movhlps xmm7, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm7 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm7, mm7
+ movlhps xmm3, xmm7 /* xmm3 = truncated r*tsc as floats */
+
+ subps xmm4, xmm3
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* indices times 4 */
+ pslld mm7, 2
+
+ movd mm0, eax /* park eax, ecx, edx in mm0-mm2 while they hold table indices */
+ movd mm1, ecx
+ movd mm2, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6 /* index for element 0 */
+ movd ecx, mm7 /* index for element 2 */
+ psrlq mm7, 32
+ movd edx, mm7 /* index for element 3; element 1 is not looked up */
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* do nontable L-J */
+ movaps xmm2, [esp + rinvO]
+ mulps xmm2, xmm2 /* xmm2=rinvsq */
+ movaps xmm1, xmm2
+ mulps xmm1, xmm1
+ mulps xmm1, xmm2 /* xmm1=rinvsix */
+ movaps xmm4, xmm1
+ mulps xmm4, xmm4 /* xmm4=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm4, [esp + c12]
+ movaps xmm3, xmm4
+ subps xmm3, xmm1 /* xmm3=vnb12-vnb6 */
+ mulps xmm1, [esp + six]
+ mulps xmm4, [esp + twelve]
+ subps xmm4, xmm1 /* xmm4 = twelve*vnb12 - six*vnb6 */
+ addps xmm3, [esp + vnbtot]
+ mulps xmm4, [esp + rinvO]
+ mulps xmm0, [esp + tsc] /* xmm0 = fijC*tsc */
+ subps xmm4, xmm0
+ movaps [esp + vnbtot], xmm3
+ mulps xmm4, [esp + rinvO] /* xmm4 = scalar force factor applied to dx, dy, dz below */
+
+ movd eax, mm0 /* restore eax, ecx, edx */
+ movd ecx, mm1
+ movd edx, mm2
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* bring element 2 (H1) down to pos 0 */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* bring element 3 (H2) down to pos 0 */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4] /* xmm6 low half = current fjx fjy */
+ movss xmm7, [edi + eax*4 + 8] /* xmm7 = current fjz */
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5 /* xmm3 = tx0 ty0 0 0 */
+ unpckhps xmm0, xmm1 /* xmm0 = tx2 ty2 tx3 ty3 */
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0 /* j atom receives the opposite of the summed i forces */
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk]
+ jz .i3120_updateouterdata /* no leftover j atoms remain */
+ jmp .i3120_odd_loop
+.i3120_updateouterdata:
+ mov ecx, [esp + ii3] /* ecx = ii3, index into faction */
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3] /* edx = is3, index into fshift */
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums now in pos 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2 /* xmm7 pos 0 = z force */
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000 /* xmm6 low half = x force, y force */
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums now in pos 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* xmm0 low half = x force, y force */
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums now in pos 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* xmm0 low half = x force, y force */
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid] /* edx = pointer into gid[] */
+ mov edx, [edx] /* edx = gid[n], index used for Vc and Vnb below */
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* pos 0 holds the sum of all four elements */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* pos 0 holds the sum of all four elements */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3120_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the argument slot */
+ jmp .i3120_outer
+.i3120_end:
+ emms /* reset MMX state before returning */
+ mov eax, [esp + salign] /* presumably the alignment padding saved at entry - confirm against prologue */
+ add esp, eax
+ add esp, 824 /* presumably matches the local stack space reserved at entry - confirm */
+ pop edi /* pop saved registers: edi, esi, edx, ecx, ebx, eax */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3130_sse
+ .type inl3130_sse,@function
+inl3130_sse:
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ two, 768
+.equ tsc, 784
+.equ c6, 800
+.equ c12, 816
+.equ six, 832
+.equ twelve, 848
+.equ vctot, 864
+.equ vnbtot, 880
+.equ fixO, 896
+.equ fiyO, 912
+.equ fizO, 928
+.equ fixH1, 944
+.equ fiyH1, 960
+.equ fizH1, 976
+.equ fixH2, 992
+.equ fiyH2, 1008
+.equ fizH2, 1024
+.equ fjxO, 1040
+.equ fjyO, 1056
+.equ fjzO, 1072
+.equ fjxH1, 1088
+.equ fjyH1, 1104
+.equ fjzH1, 1120
+.equ fjxH2, 1136
+.equ fjyH2, 1152
+.equ fjzH2, 1168
+.equ half, 1184
+.equ three, 1200
+.equ rsqOO, 1216
+.equ rsqOH1, 1232
+.equ rsqOH2, 1248
+.equ rsqH1O, 1264
+.equ rsqH1H1, 1280
+.equ rsqH1H2, 1296
+.equ rsqH2O, 1312
+.equ rsqH2H1, 1328
+.equ rsqH2H2, 1344
+.equ rinvOO, 1360
+.equ rinvOH1, 1376
+.equ rinvOH2, 1392
+.equ rinvH1O, 1408
+.equ rinvH1H1, 1424
+.equ rinvH1H2, 1440
+.equ rinvH2O, 1456
+.equ rinvH2H1, 1472
+.equ rinvH2H2, 1488
+.equ fstmp, 1504
+.equ is3, 1520
+.equ ii3, 1524
+.equ innerjjnr, 1528
+.equ innerk, 1532
+.equ salign, 1536
+ push ebp
+ mov ebp,esp /* arguments are addressed as [ebp + name] from here on */
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1540 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = esp modulo 16 */
+ sub esp, eax /* esp is now 16-byte aligned, as the movaps stores below need */
+ mov [esp + salign], eax /* remember the padding */
+
+ emms
+
+ movups xmm0, [sse_half] /* unaligned loads of the constant vectors */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movups xmm3, [sse_six]
+ movups xmm4, [sse_twelve]
+ movss xmm5, [ebp +tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ movaps [esp + six], xmm3
+ movaps [esp + twelve], xmm4
+ shufps xmm5, xmm5, 0 /* broadcast tabscale to all four elements */
+ movaps [esp + tsc], xmm5
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* xmm3 = charge[ii] */
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4] /* xmm5 = charge[ii+1] */
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3 /* charge[ii]*charge[ii] */
+ mulss xmm4, xmm5 /* charge[ii]*charge[ii+1] */
+ mulss xmm5, xmm5 /* charge[ii+1]*charge[ii+1] */
+ mulss xmm3, xmm6 /* all three products are scaled by facel */
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+ xorps xmm0, xmm0
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4] /* ecx = type[ii] */
+ shl ecx, 1
+ mov edx, ecx /* edx = 2*type[ii] */
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ add edx, ecx /* edx = ntia + 2*type[ii], index of the pair of floats loaded below */
+ mov eax, [ebp + nbfp]
+ movlps xmm0, [eax + edx*4] /* xmm0 = c6 c12 0 0 */
+ movaps xmm1, xmm0
+ shufps xmm0, xmm0, 0 /* broadcast c6 */
+ shufps xmm1, xmm1, 0b01010101 /* broadcast c12 */
+ movaps [esp + c6], xmm0
+ movaps [esp + c12], xmm1
+
+.i3130_outer:
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4] /* xmm0-xmm2 = shift vector x, y, z */
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4] /* shifted O coordinates */
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0 /* reload the shift into pos 0 only; upper elements are overwritten by the broadcast below */
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12] /* shifted H1 coordinates */
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24] /* shifted H2 coordinates */
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* innerk = number of innerloop atoms - 4 */
+ jge .i3130_unroll_loop /* flags still come from the sub: at least 4 atoms available */
+ jmp .i3130_single_check
+.i3130_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* start doing invsqrt: rsqH2H2 is still in xmm0 and rsqH2H1 in xmm4 */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO]
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO]
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2]
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1]
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half]
+ mulps xmm7, [esp + half]
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O]
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half]
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction */
+ movaps xmm0, [esp + rinvOO]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOO] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ mulps xmm3, [esp + tsc]
+
+ /* start doing lj */
+ movaps xmm2, xmm0
+ mulps xmm2, xmm2
+ movaps xmm1, xmm2
+ mulps xmm1, xmm2
+ mulps xmm1, xmm2 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulps xmm1, [esp + c6]
+ mulps xmm2, [esp + c12]
+ movaps xmm4, xmm2
+ subps xmm4, xmm1
+ addps xmm4, [esp + vnbtot]
+ mulps xmm1, [esp + six]
+ mulps xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm4
+ subps xmm2, xmm1
+ mulps xmm2, xmm0
+
+ subps xmm2, xmm3
+ mulps xmm0, xmm2
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction */
+ movaps xmm0, [esp + rinvOH1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction */
+ movaps xmm0, [esp + rinvOH2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction */
+ movaps xmm0, [esp + rinvH1O]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1O] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction */
+ movaps xmm0, [esp + rinvH1H1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction */
+ movaps xmm0, [esp + rinvH1H2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2O] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction */
+ movaps xmm0, [esp + rinvH2H1]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1]
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction */
+ movaps xmm0, [esp + rinvH2H2]
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc]
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2]
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2]
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4]
+ movups xmm2, [edi + eax*4 + 16]
+ movups xmm5, [edi + ebx*4]
+ movups xmm6, [edi + ebx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32]
+ movss xmm3, [edi + ebx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4]
+ movups xmm2, [edi + ecx*4 + 16]
+ movups xmm5, [edi + edx*4]
+ movups xmm6, [edi + edx*4 + 16]
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32]
+ movss xmm3, [edi + edx*4 + 32]
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10
+ shufps xmm7, xmm7, 0b11
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3130_single_check
+ jmp .i3130_unroll_loop
+.i3130_single_check:
+ /* The unrolled loop reaches this label after subtracting 4 from innerk and */
+ /* finding it negative. Adding the 4 back yields the number of leftover j */
+ /* waters; ZF is set when none remain. */
+ /* The operand size is spelled out: a memory destination combined with an */
+ /* immediate gives the assembler nothing to infer the size from. */
+ add dword ptr [esp + innerk], 4
+ jnz .i3130_single_loop
+ jmp .i3130_updateouterdata
+.i3130_single_loop:
+ /* Remainder loop: handles one j water per pass against the three i atoms. */
+ /* The j water is packed into one SSE register per coordinate with the lane */
+ /* layout (O, unused, H1, H2). Lane 1 always gets a zero charge below, so */
+ /* its coulomb energy and force contributions are zero. */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ /* explicit operand size: memory destination + immediate is otherwise ambiguous */
+ add dword ptr [esp + innerjjnr], 4
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr (index of first j coordinate) */
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ /* do invsqrt: rsqrtps estimate lu, refined as 0.5*lu*(3 - rsq*lu*lu) */
+ rsqrtps xmm1, xmm0
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tabscale */
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4: four floats per table point */
+ pslld mm7, 2
+
+ /* only lanes 0, 2 and 3 are looked up; the lane-1 index is never read */
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ mov esi, [ebp + VFtab]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary): qqOO 0 qqOH qqOH */
+ movss xmm3, [esp + qqOO]
+ movhps xmm3, [esp + qqOH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ mulps xmm3, [esp + tsc]
+
+ /* start doing lj - scalar (lane 0) only, i.e. the O-O pair */
+ xorps xmm2, xmm2
+ movss xmm2, xmm0
+ mulss xmm2, xmm2 /* xmm2=rinvsq */
+ movaps xmm1, xmm2
+ mulss xmm1, xmm2
+ mulss xmm1, xmm2 /* xmm1=rinvsix */
+ movaps xmm2, xmm1
+ mulss xmm2, xmm2 /* xmm2=rinvtwelve */
+ mulss xmm1, [esp + c6]
+ mulss xmm2, [esp + c12]
+ movaps xmm4, xmm2
+ subss xmm4, xmm1
+ addps xmm4, [esp + vnbtot]
+ mulss xmm1, [esp + six]
+ mulss xmm2, [esp + twelve]
+ movaps [esp + vnbtot], xmm4
+ subss xmm2, xmm1
+ mulss xmm2, xmm0
+
+ subps xmm2, xmm3 /* combine lj (lane 0) with -fijC*tabscale (all lanes) */
+ mulps xmm0, xmm2 /* xmm0=total scalar force */
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* Done with i O. Now do i H1 and H2 simultaneously; first get the i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rsqH2O], xmm4
+
+ /* do invsqrt for H1 and H2 interleaved */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rinvH2O], xmm7
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ /* esi still holds VFtab from the O section */
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary): qqOH 0 qqHH qqHH */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3 /* xmm1 = -fijC*tabscale*rinv */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do table for H2 - j water interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, [esp + rsqH2O]
+ mulps xmm1, xmm0 /* xmm0=rinv, xmm1=r */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary): qqOH 0 qqHH qqHH */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3 /* xmm1 = -fijC*tabscale*rinv */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction] /* esi now points at the force array */
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ /* eax is still 3*jnr; the nine floats at [esi + eax*4] are O, H1, H2 xyz */
+ movlps xmm0, [esi + eax*4] /* xmm0 low = fxO fyO */
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24] /* xmm1 = fxH1 fyH1 fxH2 fyH2 */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10 /* lane 2 (H1 z) to position 0 */
+ shufps xmm7, xmm7, 0b11 /* lane 3 (H2 z) to position 0 */
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4 /* low half = x,y of lane 0 (O) */
+ unpckhps xmm5, xmm4 /* x,y of lanes 2 and 3 (H1, H2) */
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz .i3130_updateouterdata
+ jmp .i3130_single_loop
+.i3130_updateouterdata:
+ /* Inner loops are done for this i water: reduce the four-lane force and */
+ /* energy accumulators to scalars, add them to faction[], fshift[], Vc[] */
+ /* and Vnb[], then either start the next i water or leave. */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift: xmm6 low = (fx, fy), xmm7 pos0 = fz */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force: x,y through the low half of xmm3, z as a scalar */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ /* explicit operand size: memory destination + immediate is otherwise ambiguous */
+ add dword ptr [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last: the nri argument slot doubles as the countdown of i waters */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3130_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3130_outer
+.i3130_end:
+ /* Function exit: leave MMX state, release the local frame, restore registers. */
+ emms /* mm registers were used as scratch; clear MMX state before returning */
+ mov eax, [esp + salign] /* alignment padding saved in the salign slot */
+ add esp, eax /* drop the alignment padding */
+ add esp, 1540 /* drop the local variable area (presumably matches the prologue's sub - confirm) */
+ /* restore saved registers (presumably the reverse of the prologue's pushes - confirm) */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+.globl inl3300_sse
+ .type inl3300_sse,@function
+inl3300_sse:
+ /* Argument offsets relative to ebp. After "push ebp / mov ebp,esp" the */
+ /* first argument sits at ebp+8 and each following one is 4 bytes further. */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use (see the prologue below) */
+ /* offsets 0-304 are 16-byte slots holding four packed floats each */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ tsc, 128
+.equ qq, 144
+.equ c6, 160
+.equ c12, 176
+.equ fscal, 192
+.equ vctot, 208
+.equ vnbtot, 224
+.equ fix, 240
+.equ fiy, 256
+.equ fiz, 272
+.equ half, 288
+.equ three, 304
+ /* offsets 320-340 are 4-byte integer/pointer slots */
+.equ is3, 320
+.equ ii3, 324
+.equ ntia, 328
+.equ innerjjnr, 332
+.equ innerk, 336
+.equ salign, 340
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 344 /* local stack space */
+ /* round esp down to a multiple of 16 so movaps can be used on the locals; */
+ /* the adjustment is kept in salign (presumably undone by the epilogue - confirm) */
+ mov eax, esp
+ and eax, 0xf
+ sub esp, eax
+ mov [esp + salign], eax
+
+ emms
+
+ /* copy the constants sse_half/sse_two/sse_three (defined outside this */
+ /* function) into aligned local slots; movups makes no alignment assumption */
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i3300_outer:
+ /* Per-i-particle setup: load shifted coordinates, charge and type offset, */
+ /* clear the accumulators and set up the j-neighbour range. The three */
+ /* list-pointer advances carry an explicit dword ptr, because a memory */
+ /* destination with an immediate has no implied operand size. */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel] /* xmm3 = facel*charge[ii] */
+ shufps xmm3, xmm3, 0
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1 /* edx = 2*ntype*type[ii] */
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ /* i coordinates = shift vector + position */
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+
+ movaps [esp + iq], xmm3
+
+ /* broadcast each i coordinate to all four lanes */
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ /* the two mov stores below leave the flags from "sub edx, 4" intact for jge */
+ mov [esp + innerk], edx /* number of innerloop atoms, minus 4 */
+ jge .i3300_unroll_loop
+ jmp .i3300_finish_inner
+.i3300_unroll_loop:
+ /* quad-unrolled inner loop: four j particles per pass, one per SSE lane */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4] /* q1 */
+ movss xmm4, [esi + ecx*4] /* q3 */
+ movss xmm6, [esi + ebx*4] /* q2 */
+ movss xmm7, [esi + edx*4] /* q4 */
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0 /* xmm3 = q1,q1,q2,q2 */
+ shufps xmm4, xmm7, 0 /* xmm4 = q3,q3,q4,q4 */
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ mulps xmm3, xmm2 /* qq = iq*qj */
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4] /* eax-edx = type[jnr1-4] */
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi /* eax-edx = ntia + 2*type[j]: index of the (c6,c12) pair in nbfp[] */
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4] /* xmm6 = pairs for j1,j2 */
+ movhps xmm7, [esi + edx*4] /* xmm7 = pairs for j3,j4 */
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = first value of each pair (c6) */
+ shufps xmm6, xmm7, 0b11011101 /* xmm6 = second value of each pair (c12) */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4] /* x1,y1 */
+ movlps xmm5, [esi + ecx*4] /* x3,y3 */
+ movss xmm2, [esi + eax*4 + 8] /* z1 */
+ movss xmm6, [esi + ecx*4 + 8] /* z3 */
+
+ movhps xmm4, [esi + ebx*4] /* xmm4 = x1,y1,x2,y2 */
+ movhps xmm5, [esi + edx*4] /* xmm5 = x3,y3,x4,y4 */
+
+ movss xmm0, [esi + ebx*4 + 8] /* z2 */
+ movss xmm1, [esi + edx*4 + 8] /* z4 */
+
+ shufps xmm2, xmm0, 0 /* z1,z1,z2,z2 */
+ shufps xmm6, xmm1, 0 /* z3,z3,z4,z4 */
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z1-z4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x1-x4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y1-y4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5, refined below as rinv = half*lu*(three-rsq*lu*lu) */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4 = r*tabscale */
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5 /* xmm6 = the four truncated indices as floats */
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4 ... */
+ pslld mm7, 2
+
+ movd mm0, eax /* save j3 for the four particles while eax-edx hold table offsets */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2] /* ... times 3: 12 floats per table point */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4 = first table value (added to eps*Fp below) */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5 = second table value (F) */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6 = third table value (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7 = fourth table value (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 32]
+ movlps xmm7, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + ebx*4 + 32]
+ movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 40]
+ movlps xmm3, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + ebx*4 + 40]
+ movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* xmm7 = fijC+fijD+fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijC+fijD+fijR)*tabscale*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0 /* restore j3 for the four particles */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx and fjy for j1-j4 (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* bring tz of j2 into lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* bring tz of j3 into lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* bring tz of j4 into lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3300_finish_inner /* fewer than four j atoms left */
+ jmp .i3300_unroll_loop
+.i3300_finish_inner:
+ /* check if at least two particles remain */
+ add [esp + innerk], 4 /* undo the -4 bias: innerk = number of j atoms still to do */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set: handle a pair first */
+ jnz .i3300_dopair
+ jmp .i3300_checksingle
+.i3300_dopair:
+ /* two remaining j particles: results live in lanes 0-1, while qq, c6 and c12 */
+ /* get zeroed upper lanes so that lanes 2-3 add nothing to the accumulators */
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx] /* eax,ebx = jnr1,jnr2 */
+ mov ebx, [ecx + 4]
+ add [esp + innerjjnr], 8 /* advance pointer by two entries */
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7 /* zero lanes 2-3 of qq */
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi /* ecx,edx = ntia + 2*type[j] */
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4] /* xmm6 = (c6,c12) pairs for j1,j2 */
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* c6 values in lanes 0-1 */
+ shufps xmm6, xmm6, 0b1101 /* c12 values in lanes 0-1 */
+ movlhps xmm4, xmm7 /* zero lanes 2-3 */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4] /* xmm1 = x1,y1,x2,y2 */
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0 /* z1,z1,z2,z2 */
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* xmm2 = z1,z2,z1,z2 */
+
+ shufps xmm0, xmm0, 0b10001000 /* xmm0 = x1,x2,x1,x2 */
+ shufps xmm1, xmm1, 0b11011101 /* xmm1 = y1,y2,y1,y2 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5, refined below as rinv = half*lu*(three-rsq*lu*lu) */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4 = r*tabscale */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 ... */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+ lea ecx, [ecx + ecx*2] /* ... times 3: 12 floats per table point */
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm7, 0b11011101 /* upper lanes come from xmm7, which is zero here */
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8]
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ /* NOTE(review): the four shuffles below take lanes 2-3 from xmm7 and xmm3 */
+ /* (leftover force terms) instead of self-shuffling like the coulomb and */
+ /* dispersion loads above; those lanes are later multiplied by the zeroed */
+ /* upper lanes of c12 - confirm this is intended */
+ movlps xmm5, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* xmm7 = fijC+fijD+fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijC+fijD+fijR)*tabscale*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's - first j1 from lane 0 */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap lanes 0 and 1: j2 force now in lane 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3300_checksingle:
+ /* bit 0 of the remaining count set: one last j particle to handle */
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i3300_dosingle
+ jmp .i3300_updateouterdata
+.i3300_dosingle:
+ /* one remaining j particle: only lane 0 carries real data; qq, c6 and c12 */
+ /* are zero in lanes 1-3 */
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx] /* eax = jnr */
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia] /* ecx = ntia + 2*type[j] */
+ movlps xmm6, [esi + ecx*4] /* lanes 0-1 = (c6,c12); lanes 2-3 keep the zeros of qq */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* xmm4 = c6,0,0,0 */
+ shufps xmm6, xmm6, 0b11111101 /* xmm6 = c12,0,0,0 */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ /* NOTE(review): lanes 1-3 of xmm0-xmm2 are zero after movss, so those lanes */
+ /* of dr are just the i position; an i position of exactly (0,0,0) would give */
+ /* rsq = 0 there - confirm the resulting non-finite values cannot reach vctot */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5, refined below as rinv = half*lu*(three-rsq*lu*lu) */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4 = r*tabscale */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 ... */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ lea ebx, [ebx + ebx*2] /* ... times 3: 12 floats per table point */
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second table value to lane 0 */
+ shufps xmm7, xmm7, 1 /* fourth table value to lane 0 */
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 32]
+ movlps xmm6, [esi + ebx*4 + 40]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* xmm7 = fijC+fijD+fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(fijC+fijD+fijR)*tabscale*rinv */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3300_updateouterdata:
+ /* reduce the per-lane accumulators of this i particle and add them to */
+ /* faction[], fshift[], Vc[] and Vnb[]; then loop or finish */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in lanes 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring lane 1 down to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* edx = pointer into gid[] */
+ mov edx, [edx] /* edx = group index */
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* lane 0 = sum of all four lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6 /* lane 0 = sum of all four lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3300_end /* the remaining i-particle count reached zero */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the argument slot */
+ jmp .i3300_outer
+.i3300_end:
+ /* epilogue: leave MMX state, release the stack frame, restore registers */
+ emms
+ mov eax, [esp + salign] /* alignment padding subtracted in the prologue */
+ add esp, eax
+ add esp, 344 /* release the local stack space */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+
+
+.globl inl3310_sse
+ .type inl3310_sse,@function
+inl3310_sse:
+ /* offsets of the caller-supplied values relative to ebp (ebp is set up by the prologue below) */
+.equ nri, 8
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+.equ nsatoms, 84
+ /* stack offsets for local variables */
+ /* esp is rounded down to a 16-byte boundary below, so the 16-byte slots can be accessed with movaps */
+.equ ix, 0
+.equ iy, 16
+.equ iz, 32
+.equ iq, 48
+.equ dx, 64
+.equ dy, 80
+.equ dz, 96
+.equ two, 112
+.equ tsc, 128
+.equ qq, 144
+.equ c6, 160
+.equ c12, 176
+.equ fscal, 192
+.equ vctot, 208
+.equ vnbtot, 224
+.equ fix, 240
+.equ fiy, 256
+.equ fiz, 272
+.equ half, 288
+.equ three, 304
+ /* 4-byte scalar slots from here on */
+.equ is3, 320
+.equ ii3, 324
+.equ shX, 328
+.equ shY, 332
+.equ shZ, 336
+.equ ntia, 340
+.equ innerjjnr0, 344
+.equ innerk0, 348
+.equ innerjjnr, 352
+.equ innerk, 356
+.equ salign, 360
+.equ nsvdwc, 364
+.equ nscoul, 368
+.equ nsvdw, 372
+.equ solnr, 376
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 380 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = esp modulo 16 */
+ sub esp, eax /* round esp down to a 16-byte boundary */
+ mov [esp + salign], eax /* remember the padding so it can be undone on exit */
+
+ emms
+
+ movups xmm0, [sse_half] /* constants sse_half/sse_two/sse_three are defined outside this routine */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp + tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+.i3310_outer:
+ /* per-molecule setup: shift vector, first atom index, the three atom */
+ /* counts derived from nsatoms[], potential accumulators and j-list bounds */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add [ebp + shift], 4 /* advance pointer one step */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movlps xmm0, [eax + ebx*4] /* x and y of the shift vector */
+ movss xmm1, [eax + ebx*4 + 8] /* z of the shift vector */
+ movlps [esp + shX], xmm0 /* fills shX and the adjacent shY slot */
+ movss [esp + shZ], xmm1
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add [ebp + iinr], 4 /* advance pointer */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov eax, [ebp + nsatoms]
+ add [ebp + nsatoms], 12 /* three ints are consumed per outer iteration */
+ mov ecx, [eax] /* first value */
+ mov edx, [eax + 4] /* second value */
+ mov eax, [eax + 8] /* third value */
+ sub ecx, eax /* ecx = first - third */
+ sub eax, edx /* eax = third - second */
+
+ mov [esp + nsvdwc], edx /* nsvdwc = second value */
+ mov [esp + nscoul], eax /* nscoul = third - second */
+ mov [esp + nsvdw], ecx /* nsvdw = first - third */
+
+ /* clear potential */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ mov [esp + solnr], ebx /* solnr starts at ii */
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add [ebp + jindex], 4
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] (4 bytes per entry) */
+ add eax, ecx
+ mov [esp + innerjjnr0], eax /* pointer to jjnr[nj0] */
+ mov [esp + innerk0], edx /* number of innerloop atoms */
+
+ mov ecx, [esp + nsvdwc]
+ cmp ecx, 0
+ jnz .i3310_mno_vdwc /* nsvdwc is nonzero: run the vdwc atom loop */
+ jmp .i3310_testcoul
+.i3310_mno_vdwc:
+ /* set up one i atom (index solnr) for the coulomb + vdw inner loop */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr] /* the next pass takes the following atom */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel] /* iq = facel*charge[ii] */
+ shufps xmm3, xmm3, 0 /* broadcast iq to all four lanes */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1 /* edx = 2*ntype*type[ii] */
+ mov [esp + ntia], edx
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+ addss xmm0, [eax + ebx*4] /* i position = shift vector + pos[ii3..] */
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8]
+ /* store the broadcast i charge */
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0 /* broadcast ix, iy, iz to all four lanes */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0] /* restart at the beginning of the j list */
+ mov [esp + innerjjnr], ecx
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* innerk = number of j atoms - 4 (mov leaves the flags of sub intact) */
+ jge .i3310_unroll_vdwc_loop /* at least four j atoms: take the unrolled loop */
+ jmp .i3310_finish_vdwc_inner
+.i3310_unroll_vdwc_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ mulps xmm3, xmm2
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 30-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point mm5 contains vcoul and mm3 fijC */
+ /* increment vcoul - then we can get rid of mm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 32]
+ movlps xmm7, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + ebx*4 + 32]
+ movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 40]
+ movlps xmm3, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + ebx*4 + 40]
+ movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3-xmm5 contains fjx, fjy, fjz */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3310_finish_vdwc_inner
+ jmp .i3310_unroll_vdwc_loop
+.i3310_finish_vdwc_inner:
+ /* innerk went negative in the unrolled loop: add the 4 back, then a set bit 1 means a pair of j particles remains */
+ add dword ptr [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i3310_dopair_vdwc
+ jmp .i3310_checksingle_vdwc
+.i3310_dopair_vdwc: /* two leftover j particles: tabulated coulomb + dispersion + repulsion */
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer past the two indices just read */
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7 /* zero the upper two elements of qq */
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi
+ add edx, edi /* ecx/edx = ntia + 2*type[j] */
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4] /* xmm6 = the two consecutive nbfp values for j1 (low) and j2 (high) */
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* first value of each pair into elements 0,1 */
+ shufps xmm6, xmm6, 0b1101 /* second value of each pair into elements 0,1 */
+ movlhps xmm4, xmm7 /* zero the upper halves */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* z of j1,j2 in elements 0,1 */
+
+ shufps xmm0, xmm0, 0b10001000 /* x of j1,j2 in elements 0,1 */
+ shufps xmm1, xmm1, 0b11011101 /* y of j1,j2 in elements 0,1 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tsc */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+ lea ecx, [ecx + ecx*2] /* index*12: twelve floats per table point */
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force back on the stack; update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(total FF)*tsc*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's - first j particle uses element 0 */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap elements 0 and 1 to reach the second j particle */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3310_checksingle_vdwc: /* is there one last unpaired j particle? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 of the remaining count */
+ jnz .i3310_dosingle_vdwc
+ jmp .i3310_updateouterdata_vdwc
+.i3310_dosingle_vdwc: /* one leftover j particle: only element 0 of each xmm register carries the interaction */
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia] /* ecx = ntia + 2*type[j] */
+ movlps xmm6, [esi + ecx*4] /* two consecutive nbfp values in elements 0,1; upper half still zero from qq */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* first value in element 0, rest zero */
+ shufps xmm6, xmm6, 0b11111101 /* second value in element 0, rest zero */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tsc */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ lea ebx, [ebx + ebx*2] /* index*12: twelve floats per table point */
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second float of each loaded pair into element 0 */
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fscal], xmm3
+
+ /* dispersion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fscal] /* add to fscal */
+
+ /* put scalar force back on the stack; update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 32]
+ movlps xmm6, [esi + ebx*4 + 40]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm4, xmm7 /* xmm4 = -(total FF)*tsc*rinv */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3310_updateouterdata_vdwc: /* reduce the 4-wide i-force accumulators and add them to faction[ii3] and fshift[is3] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in elements 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring element 1 down to element 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdwc] /* count down nsvdwc; at zero move on to the coulomb-only test */
+ jz .i3310_testcoul
+ jmp .i3310_mno_vdwc
+.i3310_testcoul: /* run the coulomb-only section only if nscoul is nonzero */
+ mov ecx, [esp + nscoul]
+ cmp ecx, 0
+ jnz .i3310_mno_coul
+ jmp .i3310_testvdw
+.i3310_mno_coul: /* set up one i particle for the coulomb-only inner loop */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr] /* ebx keeps the current index; the stored one moves to the next */
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4]
+ mulss xmm3, [ebp + facel]
+ shufps xmm3, xmm3, 0 /* broadcast charge[ii]*facel to all four elements */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8] /* xmm0-xmm2 = shift + i coordinates */
+
+ movaps [esp + iq], xmm3
+
+ shufps xmm0, xmm0, 0 /* broadcast to all four elements */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov [esp + ii3], ebx
+
+ /* clear i forces */
+ xorps xmm4, xmm4
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j index pointer from its saved start value */
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3310_unroll_coul_loop /* at least four j particles: take the unrolled loop */
+ jmp .i3310_finish_coul_inner
+
+.i3310_unroll_coul_loop: /* coulomb-only inner loop, four j particles per iteration */
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ movaps xmm2, [esp + iq]
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ mulps xmm3, xmm2
+
+ movaps [esp + qq], xmm3
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z of j1-j4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x of j1-j4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y of j1-j4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tsc */
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4 */
+ pslld mm7, 2
+
+ movd mm0, eax /* park the j3 offsets in mm0-mm3 while eax-edx hold table indices */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2] /* index*12 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -fijC*tsc*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0 /* restore the j3 offsets */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* element 1 (j2) down to element 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* element 2 (j3) down to element 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* element 3 (j4) down to element 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4
+ jl .i3310_finish_coul_inner
+ jmp .i3310_unroll_coul_loop
+.i3310_finish_coul_inner:
+ /* innerk went negative in the unrolled loop: add the 4 back, then a set bit 1 means a pair of j particles remains */
+ add dword ptr [esp + innerk], 4
+ mov edx, [esp + innerk]
+ and edx, 2
+ jnz .i3310_dopair_coul
+ jmp .i3310_checksingle_coul
+.i3310_dopair_coul: /* two leftover j particles, tabulated coulomb only */
+ mov esi, [ebp + charge]
+
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8 /* advance pointer past the two indices just read */
+ xorps xmm7, xmm7
+ movss xmm3, [esi + eax*4]
+ movss xmm6, [esi + ebx*4]
+ shufps xmm3, xmm6, 0
+ shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+ mulps xmm3, [esp + iq]
+ movlhps xmm3, xmm7 /* zero the upper two elements of qq */
+ movaps [esp + qq], xmm3
+
+ mov edi, [ebp + pos]
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* z of j1,j2 in elements 0,1 */
+
+ shufps xmm0, xmm0, 0b10001000 /* x of j1,j2 in elements 0,1 */
+ shufps xmm1, xmm1, 0b11011101 /* y of j1,j2 in elements 0,1 */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tsc */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ lea ecx, [ecx + ecx*2] /* index*12 */
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ecx*4]
+ movhps xmm5, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101 /* both operands xmm5, as in the other pair loops; elements 0,1 are the same either way */
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -fijC*tsc*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's - first j particle uses element 0 */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap elements 0 and 1 to reach the second j particle */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3310_checksingle_coul: /* is there one last unpaired j particle? */
+ mov edx, [esp + innerk]
+ and edx, 1 /* bit 0 of the remaining count */
+ jnz .i3310_dosingle_coul
+ jmp .i3310_updateouterdata_coul
+.i3310_dosingle_coul: /* one leftover j particle, tabulated coulomb only; element 0 carries the interaction */
+ mov esi, [ebp + charge]
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+ movss xmm6, [esi + eax*4] /* xmm6(0) has the charge */
+ mulps xmm6, [esp + iq]
+ movaps [esp + qq], xmm6
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* xmm4=r*tsc */
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ lea ebx, [ebx + ebx*2] /* index*12 */
+
+ movlps xmm4, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second float of each loaded pair into element 0 */
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qq]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm4, xmm4
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm4, xmm3 /* xmm4 = -fijC*tsc*rinv */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3310_updateouterdata_coul: /* reduce the 4-wide i-force accumulators and add them to faction[ii3] and fshift[is3] */
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* partial sums are now in elements 0 and 1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* bring element 1 down to element 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nscoul] /* count down nscoul; at zero move on to the vdw-only test */
+ jz .i3310_testvdw
+ jmp .i3310_mno_coul
+.i3310_testvdw: /* run the vdw-only section only if nsvdw is nonzero */
+ mov ecx, [esp + nsvdw]
+ cmp ecx, 0
+ jnz .i3310_mno_vdw
+ jmp .i3310_last_mno
+.i3310_mno_vdw: /* set up one i particle for the vdw-only inner loop */
+ mov ebx, [esp + solnr]
+ inc dword ptr [esp + solnr] /* ebx keeps the current index; the stored one moves to the next */
+
+ mov edx, [ebp + type]
+ mov edx, [edx + ebx*4]
+ imul edx, [ebp + ntype]
+ shl edx, 1
+ mov [esp + ntia], edx /* ntia = 2*ntype*type[ii] */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movss xmm0, [esp + shX]
+ movss xmm1, [esp + shY]
+ movss xmm2, [esp + shZ]
+
+ addss xmm0, [eax + ebx*4]
+ addss xmm1, [eax + ebx*4 + 4]
+ addss xmm2, [eax + ebx*4 + 8] /* xmm0-xmm2 = shift + i coordinates */
+
+ xorps xmm4, xmm4 /* clear i forces */
+ movaps [esp + fix], xmm4
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm4
+
+ shufps xmm0, xmm0, 0 /* broadcast to all four elements */
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movaps [esp + ix], xmm0
+ movaps [esp + iy], xmm1
+ movaps [esp + iz], xmm2
+
+ mov ecx, [esp + innerjjnr0]
+ mov [esp + innerjjnr], ecx /* restart the j index pointer from its saved start value */
+ mov edx, [esp + innerk0]
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms */
+ jge .i3310_unroll_vdw_loop /* at least four j particles: take the unrolled loop */
+ jmp .i3310_finish_vdw_inner
+.i3310_unroll_vdw_loop: /* tabulated dispersion + repulsion between the i atom and four j atoms */
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+ add dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4); operand size is explicit */
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4] /* eax-edx = type[jnr1-4] */
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi /* eax-edx = ntia + 2*type[jnr], index into nbfp */
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4] /* each 8-byte load fetches the value pair stored as c6/c12 below */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000 /* even elements -> xmm4 */
+ shufps xmm6, xmm7, 0b11011101 /* odd elements -> xmm6 */
+
+ movd eax, mm0 /* restore jnr1-4 */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000 /* xmm2 = z of j1-j4 */
+
+ shufps xmm0, xmm5, 0b10001000 /* xmm0 = x of j1-j4 */
+ shufps xmm1, xmm5, 0b11011101 /* xmm1 = y of j1-j4 */
+
+ /* move ix-iz to xmm4-xmm6 */
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc] /* scale r by tsc to get the table coordinate */
+
+ movhlps xmm5, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm5 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ cvtpi2ps xmm5, mm7
+ movlhps xmm6, xmm5
+ subps xmm4, xmm6 /* subtract the truncated index, leaving the fractional part */
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2 /* index*4 */
+ pslld mm7, 2
+
+ movd mm0, eax /* save j3 values while eax-edx hold table offsets */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2] /* index*12: twelve floats per table point */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ /* dispersion - NOTE(review): inl3320_sse in this file uses the same 12-float stride with coulomb/dispersion/repulsion at byte offsets 0/16/32; offsets 0/16 are used here - confirm against the VFtab layout */
+ movlps xmm5, [esi + eax*4 + 0]
+ movlps xmm7, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + ebx*4 + 0]
+ movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* fijD + fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0 /* xmm0 still holds rinv */
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ movd eax, mm0 /* restore j3 values */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ mov edi, [ebp + faction]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* the fj's - start by accumulating x & y forces from memory */
+ movlps xmm4, [edi + eax*4]
+ movlps xmm6, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm6, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm6, 0b10001000
+ shufps xmm4, xmm6, 0b11011101
+
+ /* now xmm3 contains fjx and xmm4 fjy for j1-j4; the z components are handled below */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+
+ /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+ movaps xmm6, xmm3
+ unpcklps xmm6, xmm4
+ unpckhps xmm3, xmm4
+ /* xmm6(l)=x & y for j1, (h) for j2 */
+ /* xmm3(l)=x & y for j3, (h) for j4 */
+ movlps [edi + eax*4], xmm6
+ movlps [edi + ecx*4], xmm3
+
+ movhps [edi + ebx*4], xmm6
+ movhps [edi + edx*4], xmm3
+
+ /* and the z forces */
+ movss xmm4, [edi + eax*4 + 8]
+ movss xmm5, [edi + ebx*4 + 8]
+ movss xmm6, [edi + ecx*4 + 8]
+ movss xmm7, [edi + edx*4 + 8]
+ subss xmm4, xmm2
+ shufps xmm2, xmm2, 0b11100101 /* lane 1 (tz for j2) -> lane 0 */
+ subss xmm5, xmm2
+ shufps xmm2, xmm2, 0b11101010 /* lane 2 (tz for j3) -> lane 0 */
+ subss xmm6, xmm2
+ shufps xmm2, xmm2, 0b11111111 /* lane 3 (tz for j4) -> lane 0 */
+ subss xmm7, xmm2
+ movss [edi + eax*4 + 8], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+ movss [edi + ecx*4 + 8], xmm6
+ movss [edi + edx*4 + 8], xmm7
+
+ /* should we do one more iteration? */
+ sub dword ptr [esp + innerk], 4 /* operand size is explicit */
+ jl .i3310_finish_vdw_inner
+ jmp .i3310_unroll_vdw_loop
+.i3310_finish_vdw_inner: /* 0-3 j atoms are left after the unrolled loop */
+ /* check if at least two particles remain */
+ add dword ptr [esp + innerk], 4 /* undo the -4 bias on innerk; operand size is explicit */
+ mov edx, [esp + innerk]
+ and edx, 2 /* bit 1 set: handle a pair */
+ jnz .i3310_dopair_vdw
+ jmp .i3310_checksingle_vdw
+.i3310_dopair_vdw: /* tabulated dispersion + repulsion between the i atom and two j atoms (lanes 0 and 1) */
+ mov ecx, [esp + innerjjnr]
+
+ mov eax, [ecx]
+ mov ebx, [ecx + 4]
+ add dword ptr [esp + innerjjnr], 8 /* advance past the two j atoms; operand size is explicit */
+ xorps xmm7, xmm7
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov edx, ebx
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add ecx, edi /* ecx/edx = ntia + 2*type[jnr], index into nbfp */
+ add edx, edi
+ movlps xmm6, [esi + ecx*4]
+ movhps xmm6, [esi + edx*4]
+ mov edi, [ebp + pos]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b1000 /* elements 0 and 2 -> lanes 0 and 1 */
+ shufps xmm6, xmm6, 0b1101 /* elements 1 and 3 -> lanes 0 and 1 */
+ movlhps xmm4, xmm7 /* zero lanes 2 and 3 so the unused lanes contribute nothing */
+ movlhps xmm6, xmm7
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ /* move coordinates to xmm0-xmm2 */
+ movlps xmm1, [edi + eax*4]
+ movss xmm2, [edi + eax*4 + 8]
+ movhps xmm1, [edi + ebx*4]
+ movss xmm0, [edi + ebx*4 + 8]
+
+ movlhps xmm3, xmm7 /* zeroes the high half of xmm3; xmm3 is no longer read before it is reloaded */
+
+ shufps xmm2, xmm0, 0
+
+ movaps xmm0, xmm1
+
+ shufps xmm2, xmm2, 0b10001000 /* z of j1, j2 (repeated in lanes 2 and 3) */
+
+ shufps xmm0, xmm0, 0b10001000 /* x of j1, j2 (repeated in lanes 2 and 3) */
+ shufps xmm1, xmm1, 0b11011101 /* y of j1, j2 (repeated in lanes 2 and 3) */
+
+ mov edi, [ebp + faction]
+ /* move ix-iz to xmm4-xmm6 */
+ xorps xmm7, xmm7
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ecx, mm6
+ psrlq mm6, 32
+ movd edx, mm6
+
+ lea ecx, [ecx + ecx*2] /* index*12: twelve floats per table point */
+ lea edx, [edx + edx*2]
+
+ /* dispersion - NOTE(review): inl3320_sse in this file uses the same 12-float stride with coulomb/dispersion/repulsion at byte offsets 0/16/32; offsets 0/16 are used here - confirm against the VFtab layout */
+ movlps xmm5, [esi + ecx*4 + 0]
+ movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion - each register is shuffled with itself, as in the dispersion part, so no stale register contents reach lanes 2 and 3 */
+ movlps xmm5, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* fijD + fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0 /* xmm0 still holds rinv */
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv */
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update the fj's */
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+
+ shufps xmm0, xmm0, 0b11100001 /* swap lanes 0 and 1: the second j atom's force moves to lane 0 */
+ shufps xmm1, xmm1, 0b11100001
+ shufps xmm2, xmm2, 0b11100001
+
+ movss xmm3, [edi + ebx*4]
+ movss xmm4, [edi + ebx*4 + 4]
+ movss xmm5, [edi + ebx*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + ebx*4], xmm3
+ movss [edi + ebx*4 + 4], xmm4
+ movss [edi + ebx*4 + 8], xmm5
+
+.i3310_checksingle_vdw: /* handle one last j atom when bit 0 of innerk is set */
+ mov edx, [esp + innerk]
+ and edx, 1
+ jnz .i3310_dosingle_vdw
+ jmp .i3310_updateouterdata_vdw /* nothing left: write back the i forces */
+.i3310_dosingle_vdw: /* tabulated dispersion + repulsion between the i atom and one j atom (lane 0) */
+ mov edi, [ebp + pos]
+ mov ecx, [esp + innerjjnr]
+ mov eax, [ecx]
+ xorps xmm6, xmm6
+
+ mov esi, [ebp + type]
+ mov ecx, eax
+ mov ecx, [esi + ecx*4]
+ mov esi, [ebp + nbfp]
+ shl ecx, 1
+ add ecx, [esp + ntia] /* ecx = ntia + 2*type[jnr], index into nbfp */
+ movlps xmm6, [esi + ecx*4] /* lanes 2 and 3 stay zero from the xorps above */
+ movaps xmm4, xmm6
+ shufps xmm4, xmm4, 0b11111100 /* element 0 in lane 0, zeros elsewhere */
+ shufps xmm6, xmm6, 0b11111101 /* element 1 in lane 0, zeros elsewhere */
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ lea eax, [eax + eax*2]
+
+ /* move coordinates to xmm0-xmm2 */
+ movss xmm0, [edi + eax*4]
+ movss xmm1, [edi + eax*4 + 4]
+ movss xmm2, [edi + eax*4 + 8]
+
+ movaps xmm4, [esp + ix]
+ movaps xmm5, [esp + iy]
+ movaps xmm6, [esp + iz]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dx], xmm4
+ movaps [esp + dy], xmm5
+ movaps [esp + dz], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5 */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* 3.0-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+
+ mulps xmm4, xmm0 /* xmm4=r */
+ mulps xmm4, [esp + tsc]
+
+ cvttps2pi mm6, xmm4 /* mm6 contain lu indices */
+ cvtpi2ps xmm6, mm6
+ subps xmm4, xmm6
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+
+ pslld mm6, 2 /* index*4 */
+
+ mov esi, [ebp + VFtab]
+ movd ebx, mm6
+
+ lea ebx, [ebx + ebx*2] /* index*12: twelve floats per table point */
+
+ /* dispersion - NOTE(review): inl3320_sse in this file uses the same 12-float stride with coulomb/dispersion/repulsion at byte offsets 0/16/32; offsets 0/16 are used here - confirm against the VFtab layout */
+ movlps xmm4, [esi + ebx*4 + 0]
+ movlps xmm6, [esi + ebx*4 + 8]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1 /* second float of each pair -> lane 0 */
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+
+ /* put scalar force on stack Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fscal], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm4, [esi + ebx*4 + 16]
+ movlps xmm6, [esi + ebx*4 + 24]
+ movaps xmm5, xmm4
+ movaps xmm7, xmm6
+ shufps xmm5, xmm5, 1
+ shufps xmm7, xmm7, 1
+ /* table ready in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fscal] /* fijD + fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0 /* xmm0 still holds rinv */
+ subps xmm4, xmm7 /* xmm4 = -(fijD+fijR)*tsc*rinv */
+ mov edi, [ebp + faction]
+
+ movaps xmm0, [esp + dx]
+ movaps xmm1, [esp + dy]
+ movaps xmm2, [esp + dz]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ /* xmm0-xmm2 contains tx-tz (partial force) */
+ /* now update f_i */
+ movaps xmm3, [esp + fix]
+ movaps xmm4, [esp + fiy]
+ movaps xmm5, [esp + fiz]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm5, xmm2
+ movaps [esp + fix], xmm3
+ movaps [esp + fiy], xmm4
+ movaps [esp + fiz], xmm5
+ /* update fj */
+
+ movss xmm3, [edi + eax*4]
+ movss xmm4, [edi + eax*4 + 4]
+ movss xmm5, [edi + eax*4 + 8]
+ subss xmm3, xmm0
+ subss xmm4, xmm1
+ subss xmm5, xmm2
+ movss [edi + eax*4], xmm3
+ movss [edi + eax*4 + 4], xmm4
+ movss [edi + eax*4 + 8], xmm5
+.i3310_updateouterdata_vdw: /* reduce the i-force accumulators and add them to faction[] and fshift[] */
+ mov ecx, [esp + ii3] /* ecx = ii3, index of the current i atom's x component */
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3] /* edx = is3, used below as the index into fshift */
+
+ /* accumulate i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fix]
+ movaps xmm1, [esp + fiy]
+ movaps xmm2, [esp + fiz]
+
+ movhlps xmm3, xmm0 /* low half of xmm3-xmm5 = high half of xmm0-xmm2 */
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 now each hold half of the sum */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1 /* move lane 1 down to lane 0 */
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* increment fshift force */
+ movss xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 4]
+ movss xmm5, [esi + edx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esi + edx*4], xmm3
+ movss [esi + edx*4 + 4], xmm4
+ movss [esi + edx*4 + 8], xmm5
+
+ /* loop back to mno */
+ dec dword ptr [esp + nsvdw] /* decrement the nsvdw counter */
+ jz .i3310_last_mno /* counter reached zero: go update the energies */
+ jmp .i3310_mno_vdw
+.i3310_last_mno: /* add the accumulated energies to Vc[] and Vnb[], then loop or finish */
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add dword ptr [ebp + gid], 4 /* advance pointer; operand size is explicit */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1 /* move lane 1 down to lane 0 */
+ addss xmm7, xmm6 /* lane 0 = sum of all four lanes */
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3310_end /* remaining count reached zero */
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the nri argument slot */
+ jmp .i3310_outer
+.i3310_end: /* common exit: leave MMX state, release the stack frame, restore registers */
+ emms
+ mov eax, [esp + salign] /* alignment padding saved in the salign slot */
+ add esp, eax /* undo the alignment padding */
+ add esp, 380 /* NOTE(review): must equal the local stack size reserved at function entry, which is outside this view - confirm */
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3320_sse
+ .type inl3320_sse,@function
+inl3320_sse: /* entry and one-time setup; the loop body follows at .i3320_outer */
+.equ nri, 8 /* offsets 8-80 are used relative to ebp for the caller's arguments */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+ /* esp is rounded down to a 16-byte boundary below, so the 16-byte slots can be accessed with movaps */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ iqO, 144
+.equ iqH, 160
+.equ dxO, 176
+.equ dyO, 192
+.equ dzO, 208
+.equ dxH1, 224
+.equ dyH1, 240
+.equ dzH1, 256
+.equ dxH2, 272
+.equ dyH2, 288
+.equ dzH2, 304
+.equ qqO, 320
+.equ qqH, 336
+.equ rinvO, 352
+.equ rinvH1, 368
+.equ rinvH2, 384
+.equ rO, 400
+.equ rH1, 416
+.equ rH2, 432
+.equ tsc, 448
+.equ two, 464
+.equ c6, 480
+.equ c12, 496
+.equ vctot, 512
+.equ vnbtot, 528
+.equ fixO, 544
+.equ fiyO, 560
+.equ fizO, 576
+.equ fixH1, 592
+.equ fiyH1, 608
+.equ fizH1, 624
+.equ fixH2, 640
+.equ fiyH2, 656
+.equ fizH2, 672
+.equ fjx, 688
+.equ fjy, 704
+.equ fjz, 720
+.equ half, 736
+.equ three, 752
+.equ is3, 768 /* from here on the slots are 4 bytes wide */
+.equ ii3, 772
+.equ ntia, 776
+.equ innerjjnr, 780
+.equ innerk, 784
+.equ salign, 788
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 792 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = esp modulo 16 */
+ sub esp, eax /* esp is now 16-byte aligned */
+ mov [esp + salign], eax /* remember the padding */
+
+ emms
+
+ movups xmm0, [sse_half]
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp +tabscale]
+
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* charge[ii] */
+ movss xmm4, [edx + ebx*4 + 4] /* charge[ii+1] */
+ movss xmm5, [ebp + facel]
+ mulss xmm3, xmm5
+ mulss xmm4, xmm5
+
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ movaps [esp + iqO], xmm3 /* iqO = facel*charge[ii] in all lanes */
+ movaps [esp + iqH], xmm4 /* iqH = facel*charge[ii+1] in all lanes */
+
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4]
+ shl ecx, 1
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ mov [esp + ntia], ecx
+.i3320_outer: /* per-i setup: shifted positions at ii3, ii3+3 and ii3+6, cleared accumulators, j list */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step; operand size is explicit */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4]
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer; operand size is explicit */
+ mov ebx, [ecx] /* ebx =ii */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ addss xmm3, [eax + ebx*4] /* shift vector + pos[ii3..ii3+2] */
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0 /* lane 0 = shift vector component again */
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12] /* shift vector + pos[ii3+3..ii3+5] */
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24] /* shift vector + pos[ii3+6..ii3+8] */
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4 /* advance pointer; operand size is explicit */
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4; mov leaves the flags from sub intact */
+ jge .i3320_unroll_loop /* at least four j atoms: take the unrolled loop */
+ jmp .i3320_odd_inner
+.i3320_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + charge] /* base of charge[] */
+
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + ecx*4]
+ movss xmm6, [esi + ebx*4]
+ movss xmm7, [esi + edx*4]
+
+ shufps xmm3, xmm6, 0
+ shufps xmm4, xmm7, 0
+ shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+ movaps xmm4, xmm3 /* and in xmm4 */
+ mulps xmm3, [esp + iqO]
+ mulps xmm4, [esp + iqH]
+
+ movd mm0, eax /* use mmx registers as temp storage */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ movaps [esp + qqO], xmm3
+ movaps [esp + qqH], xmm4
+
+ mov esi, [ebp + type]
+ mov eax, [esi + eax*4]
+ mov ebx, [esi + ebx*4]
+ mov ecx, [esi + ecx*4]
+ mov edx, [esi + edx*4]
+ mov esi, [ebp + nbfp]
+ shl eax, 1
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ mov edi, [esp + ntia]
+ add eax, edi
+ add ebx, edi
+ add ecx, edi
+ add edx, edi
+
+ movlps xmm6, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm6, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4]
+
+ movaps xmm4, xmm6
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm6, xmm7, 0b11011101
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ movaps [esp + c6], xmm4
+ movaps [esp + c12], xmm6
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move four coordinates to xmm0-xmm2 */
+ movlps xmm4, [esi + eax*4]
+ movlps xmm5, [esi + ecx*4]
+ movss xmm2, [esi + eax*4 + 8]
+ movss xmm6, [esi + ecx*4 + 8]
+
+ movhps xmm4, [esi + ebx*4]
+ movhps xmm5, [esi + edx*4]
+
+ movss xmm0, [esi + ebx*4 + 8]
+ movss xmm1, [esi + edx*4 + 8]
+
+ shufps xmm2, xmm0, 0
+ shufps xmm6, xmm1, 0
+
+ movaps xmm0, xmm4
+ movaps xmm1, xmm4
+
+ shufps xmm2, xmm6, 0b10001000
+
+ shufps xmm0, xmm5, 0b10001000
+ shufps xmm1, xmm5, 0b11011101
+
+ /* move ixO-izO to xmm4-xmm6 */
+ movaps xmm4, [esp + ixO]
+ movaps xmm5, [esp + iyO]
+ movaps xmm6, [esp + izO]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxO], xmm4
+ movaps [esp + dyO], xmm5
+ movaps [esp + dzO], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm4, xmm5
+ addps xmm4, xmm6
+ movaps xmm7, xmm4
+ /* rsqO in xmm7 */
+
+ /* move ixH1-izH1 to xmm4-xmm6 */
+ movaps xmm4, [esp + ixH1]
+ movaps xmm5, [esp + iyH1]
+ movaps xmm6, [esp + izH1]
+
+ /* calc dr */
+ subps xmm4, xmm0
+ subps xmm5, xmm1
+ subps xmm6, xmm2
+
+ /* store dr */
+ movaps [esp + dxH1], xmm4
+ movaps [esp + dyH1], xmm5
+ movaps [esp + dzH1], xmm6
+ /* square it */
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ mulps xmm6,xmm6
+ addps xmm6, xmm5
+ addps xmm6, xmm4
+ /* rsqH1 in xmm6 */
+
+ /* move ixH2-izH2 to xmm3-xmm5 */
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+
+ /* calc dr */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+
+ /* store dr */
+ movaps [esp + dxH2], xmm3
+ movaps [esp + dyH2], xmm4
+ movaps [esp + dzH2], xmm5
+ /* square it */
+ mulps xmm3,xmm3
+ mulps xmm4,xmm4
+ mulps xmm5,xmm5
+ addps xmm5, xmm4
+ addps xmm5, xmm3
+ /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+ /* start with rsqO - seed to xmm2 */
+ rsqrtps xmm2, xmm7
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm7 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvO], xmm4 /* rinvO in xmm4 */
+ mulps xmm7, xmm4
+ movaps [esp + rO], xmm7
+
+ /* rsqH1 - seed in xmm2 */
+ rsqrtps xmm2, xmm6
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm6 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH1], xmm4 /* rinvH1 in xmm4 */
+ mulps xmm6, xmm4
+ movaps [esp + rH1], xmm6
+
+ /* rsqH2 - seed to xmm2 */
+ rsqrtps xmm2, xmm5
+ movaps xmm3, xmm2
+ mulps xmm2, xmm2
+ movaps xmm4, [esp + three]
+ mulps xmm2, xmm5 /* rsq*lu*lu */
+ subps xmm4, xmm2 /* 30-rsq*lu*lu */
+ mulps xmm4, xmm3 /* lu*(3-rsq*lu*lu) */
+ mulps xmm4, [esp + half]
+ movaps [esp + rinvH2], xmm4 /* rinvH2 in xmm4 */
+ mulps xmm5, xmm4
+ movaps [esp + rH2], xmm5
+
+ /* do O interactions */
+ /* rO is still in xmm7 */
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vctot - then we can reuse xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* dispersion */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm0, xmm7 /* add to fscal */
+
+ /* Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movlps xmm5, [esi + eax*4 + 32]
+ movlps xmm7, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + ebx*4 + 32]
+ movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 40]
+ movlps xmm3, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + ebx*4 + 40]
+ movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* repulsion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, xmm0 /* add to fscal */
+ addps xmm5, [esp + vnbtot] /* total nonbonded potential in xmm5 */
+ xorps xmm4, xmm4
+
+ mulps xmm7, [esp + rinvO] /* total fscal now in xmm7 */
+
+ mulps xmm7, [esp + tsc]
+ movaps [esp + vnbtot], xmm5
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* tx in xmm0-xmm2 */
+
+ /* update O forces */
+ movaps xmm3, [esp + fixO]
+ movaps xmm4, [esp + fiyO]
+ movaps xmm7, [esp + fizO]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixO], xmm3
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm7
+ /* update j forces with water O */
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with O interactions - now H1! */
+ movaps xmm7, [esp + rH1]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point mm5 contains vcoul and xmm7 fijC */
+ /* increment vcoul */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH1]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH1]
+ movaps xmm1, [esp + dyH1]
+ movaps xmm2, [esp + dzH1]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ /* update H1 forces */
+ movaps xmm3, [esp + fixH1]
+ movaps xmm4, [esp + fiyH1]
+ movaps xmm7, [esp + fizH1]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH1], xmm3
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm7
+ /* update j forces with water H1 */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+ movaps [esp + fjx], xmm0
+ movaps [esp + fjy], xmm1
+ movaps [esp + fjz], xmm2
+
+ /* Done with H1, finally we do H2 interactions */
+ movaps xmm7, [esp + rH2]
+ mulps xmm7, [esp + tsc]
+ movhlps xmm4, xmm7
+ cvttps2pi mm6, xmm7
+ cvttps2pi mm7, xmm4 /* mm6/mm7 contain lu indices */
+
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm4, mm7
+ movlhps xmm3, xmm4
+
+ subps xmm7, xmm3
+ movaps xmm1, xmm7 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd eax, mm6
+ psrlq mm6, 32
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd ebx, mm6
+ movd edx, mm7
+
+ lea eax, [eax + eax*2]
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm7, xmm0 /* fijC=FF*qq */
+ /* at this point mm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul */
+ xorps xmm4, xmm4
+ addps xmm5, [esp + vctot]
+ mulps xmm7, [esp + rinvH2]
+ movaps [esp + vctot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7
+
+ movaps xmm0, [esp + dxH2]
+ movaps xmm1, [esp + dyH2]
+ movaps xmm2, [esp + dzH2]
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+
+ movd eax, mm0
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* update H2 forces */
+ movaps xmm3, [esp + fixH2]
+ movaps xmm4, [esp + fiyH2]
+ movaps xmm7, [esp + fizH2]
+ addps xmm3, xmm0
+ addps xmm4, xmm1
+ addps xmm7, xmm2
+ movaps [esp + fixH2], xmm3
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm7
+
+ mov edi, [ebp +faction]
+ /* update j forces */
+ addps xmm0, [esp + fjx]
+ addps xmm1, [esp + fjy]
+ addps xmm2, [esp + fjz]
+
+ movlps xmm4, [edi + eax*4]
+ movlps xmm7, [edi + ecx*4]
+ movhps xmm4, [edi + ebx*4]
+ movhps xmm7, [edi + edx*4]
+
+ movaps xmm3, xmm4
+ shufps xmm3, xmm7, 0b10001000
+ shufps xmm4, xmm7, 0b11011101
+ /* xmm3 has fjx, xmm4 has fjy */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ /* unpack them back for storing */
+ movaps xmm7, xmm3
+ unpcklps xmm7, xmm4
+ unpckhps xmm3, xmm4
+ movlps [edi + eax*4], xmm7
+ movlps [edi + ecx*4], xmm3
+ movhps [edi + ebx*4], xmm7
+ movhps [edi + edx*4], xmm3
+ /* finally z forces */
+ movss xmm0, [edi + eax*4 + 8]
+ movss xmm1, [edi + ebx*4 + 8]
+ movss xmm3, [edi + ecx*4 + 8]
+ movss xmm4, [edi + edx*4 + 8]
+ subss xmm0, xmm2
+ shufps xmm2, xmm2, 0b11100101
+ subss xmm1, xmm2
+ shufps xmm2, xmm2, 0b11101010
+ subss xmm3, xmm2
+ shufps xmm2, xmm2, 0b11111111
+ subss xmm4, xmm2
+ movss [edi + eax*4 + 8], xmm0
+ movss [edi + ebx*4 + 8], xmm1
+ movss [edi + ecx*4 + 8], xmm3
+ movss [edi + edx*4 + 8], xmm4
+
+ /* should we do one more iteration? */
+ sub [esp + innerk], 4
+ jl .i3320_odd_inner
+ jmp .i3320_unroll_loop
+.i3320_odd_inner: /* fewer than four j atoms remain - decide whether any are left at all */
+ add dword ptr [esp + innerk], 4 /* add back the 4 subtracted before branching here; explicit size, ZF set when nothing is left */
+ jnz .i3320_odd_loop /* leftovers exist: handle them one j atom at a time */
+ jmp .i3320_updateouterdata /* none left: write back the results for this i water */
+.i3320_odd_loop: /* one j atom per pass against the i water; lanes are O, unused, H1, H2 */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx] /* eax = jnr */
+ add dword ptr [esp + innerjjnr], 4 /* advance to the next j index (operand size made explicit) */
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + iqO] /* lane 0 = iqO */
+ mov esi, [ebp + charge]
+ movhps xmm4, [esp + iqH] /* lanes 2-3 = two floats from iqH, lane 1 stays zero */
+ movss xmm3, [esi + eax*4] /* charge in xmm3 */
+ shufps xmm3, xmm3, 0 /* broadcast the j charge to all lanes */
+ mulps xmm3, xmm4
+ movaps [esp + qqO], xmm3 /* use oxygen qq for storage */
+
+ xorps xmm6, xmm6
+ mov esi, [ebp + type]
+ mov ebx, [esi + eax*4] /* ebx = type[jnr] */
+ mov esi, [ebp + nbfp]
+ shl ebx, 1
+ add ebx, [esp + ntia] /* ebx = 2*type[jnr] + ntia */
+ movlps xmm6, [esi + ebx*4] /* two consecutive nbfp floats: c6, c12 */
+ movaps xmm7, xmm6
+ shufps xmm6, xmm6, 0b11111100 /* xmm6 = c6 0 0 0 */
+ shufps xmm7, xmm7, 0b11111101 /* xmm7 = c12 0 0 0 */
+ movaps [esp + c6], xmm6
+ movaps [esp + c12], xmm7
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2] /* eax = 3*jnr */
+
+ /* move j coords to xmm0-xmm2 */
+ movss xmm0, [esi + eax*4]
+ movss xmm1, [esi + eax*4 + 4]
+ movss xmm2, [esi + eax*4 + 8]
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+
+ movss xmm3, [esp + ixO]
+ movss xmm4, [esp + iyO]
+ movss xmm5, [esp + izO]
+
+ movlps xmm6, [esp + ixH1]
+ movlps xmm7, [esp + ixH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm3, xmm6 /* xmm3 = ixO 0 ixH1 ixH2 */
+ movlps xmm6, [esp + iyH1]
+ movlps xmm7, [esp + iyH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm4, xmm6 /* xmm4 = iyO 0 iyH1 iyH2 */
+ movlps xmm6, [esp + izH1]
+ movlps xmm7, [esp + izH2]
+ unpcklps xmm6, xmm7
+ movlhps xmm5, xmm6 /* xmm5 = izO 0 izH1 izH2 */
+
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* dx, dy, dz in xmm3-xmm5 */
+
+ movaps [esp + dxO], xmm3
+ movaps [esp + dyO], xmm4
+ movaps [esp + dzO], xmm5
+
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ /* rsq in xmm4 */
+
+ rsqrtps xmm5, xmm4
+ /* lookup seed in xmm5, refined below as rinv = half*lu*(three-rsq*lu*lu) */
+ movaps xmm2, xmm5
+ mulps xmm5, xmm5
+ movaps xmm1, [esp + three]
+ mulps xmm5, xmm4 /* rsq*lu*lu */
+ movaps xmm0, [esp + half]
+ subps xmm1, xmm5 /* three-rsq*lu*lu */
+ mulps xmm1, xmm2
+ mulps xmm0, xmm1 /* xmm0=rinv */
+ mulps xmm4, xmm0 /* xmm4=r */
+ movaps [esp + rinvO], xmm0
+
+ mulps xmm4, [esp + tsc] /* scale r by tsc before truncating to a table index */
+ movhlps xmm7, xmm4
+ cvttps2pi mm6, xmm4
+ cvttps2pi mm7, xmm7 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm7, mm7
+ movlhps xmm3, xmm7
+
+ subps xmm4, xmm3
+ movaps xmm1, xmm4 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+
+ movd mm0, eax /* keep eax, ecx, edx in mm0-mm2 during the table lookups */
+ movd mm1, ecx
+ movd mm2, edx
+
+ mov esi, [ebp + VFtab]
+ movd eax, mm6 /* table index for lane 0 (O) */
+ movd ecx, mm7 /* table index for lane 2 (H1) */
+ psrlq mm7, 32
+ movd edx, mm7 /* table index for lane 3 (H2) */
+
+ lea eax, [eax + eax*2] /* indices are now 12*n, i.e. 12 floats per table point */
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm0, [esp + qqO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm0 /* vcoul=qq*VV */
+ mulps xmm0, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm0 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* dispersion - scalar ops, since c6 is zero outside lane 0 */
+ movlps xmm5, [esi + eax*4 + 16] /* half table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b11111100
+ shufps xmm5, xmm5, 0b11111101
+
+ movlps xmm7, [esi + eax*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b11111100
+ shufps xmm7, xmm7, 0b11111101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulss xmm6, xmm1 /* xmm6=Geps */
+ mulss xmm7, xmm2 /* xmm7=Heps2 */
+ addss xmm5, xmm6
+ addss xmm5, xmm7 /* xmm5=Fp */
+ mulss xmm7, [esp + two] /* two*Heps2 */
+ addss xmm7, xmm6
+ addss xmm7, xmm5 /* xmm7=FF */
+ mulss xmm5, xmm1 /* xmm5=eps*Fp */
+ addss xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm0, xmm7 /* add to fscal */
+
+ /* Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion - scalar ops, since c12 is zero outside lane 0 */
+ movlps xmm5, [esi + eax*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm4, 0b10001000
+ shufps xmm5, xmm5, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 0b10001000
+ shufps xmm7, xmm7, 0b11011101
+ /* repulsion table ready, in xmm4-xmm7 */
+ mulss xmm6, xmm1 /* xmm6=Geps */
+ mulss xmm7, xmm2 /* xmm7=Heps2 */
+ addss xmm5, xmm6
+ addss xmm5, xmm7 /* xmm5=Fp */
+ mulss xmm7, [esp + two] /* two*Heps2 */
+ addss xmm7, xmm6
+ addss xmm7, xmm5 /* xmm7=FF */
+ mulss xmm5, xmm1 /* xmm5=eps*Fp */
+ addss xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, xmm0 /* add to fscal */
+ addps xmm5, [esp + vnbtot] /* total nonbonded potential in xmm5 */
+
+ xorps xmm4, xmm4
+ movd eax, mm0 /* restore eax, ecx, edx saved before the table lookups */
+ movd ecx, mm1
+ movd edx, mm2
+
+ mulps xmm7, [esp + rinvO] /* total fscal now in xmm7 */
+ movaps [esp + vnbtot], xmm5
+ mulps xmm7, [esp + tsc]
+ subps xmm4, xmm7 /* xmm4 = 0 - xmm7, the negated scalar force */
+
+ movaps xmm0, [esp + dxO]
+ movaps xmm1, [esp + dyO]
+ movaps xmm2, [esp + dzO]
+
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+ movss xmm3, [esp + fixO]
+ movss xmm4, [esp + fiyO]
+ movss xmm5, [esp + fizO]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [esp + fixO], xmm3
+ movss [esp + fiyO], xmm4
+ movss [esp + fizO], xmm5 /* updated the O force now do the H's */
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ shufps xmm3, xmm3, 0b11100110 /* bring lane 2 (H1) down to lane 0 */
+ shufps xmm4, xmm4, 0b11100110
+ shufps xmm5, xmm5, 0b11100110
+ addss xmm3, [esp + fixH1]
+ addss xmm4, [esp + fiyH1]
+ addss xmm5, [esp + fizH1]
+ movss [esp + fixH1], xmm3
+ movss [esp + fiyH1], xmm4
+ movss [esp + fizH1], xmm5 /* updated the H1 force */
+
+ mov edi, [ebp + faction]
+ shufps xmm3, xmm3, 0b11100111 /* bring lane 3 (H2) down to lane 0 */
+ shufps xmm4, xmm4, 0b11100111
+ shufps xmm5, xmm5, 0b11100111
+ addss xmm3, [esp + fixH2]
+ addss xmm4, [esp + fiyH2]
+ addss xmm5, [esp + fizH2]
+ movss [esp + fixH2], xmm3
+ movss [esp + fiyH2], xmm4
+ movss [esp + fizH2], xmm5 /* updated the H2 force */
+
+ /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+ xorps xmm5, xmm5
+ movaps xmm3, xmm0
+ movlps xmm6, [edi + eax*4] /* current x,y force on j */
+ movss xmm7, [edi + eax*4 + 8] /* current z force on j */
+ unpcklps xmm3, xmm1
+ movlhps xmm3, xmm5
+ unpckhps xmm0, xmm1
+ addps xmm0, xmm3
+ movhlps xmm3, xmm0
+ addps xmm0, xmm3 /* x,y sum in xmm0 */
+
+ movhlps xmm1, xmm2
+ addss xmm2, xmm1
+ shufps xmm1, xmm1, 1
+ addss xmm2, xmm1 /* z sum in xmm2 */
+ subps xmm6, xmm0 /* subtract the summed force from the j force */
+ subss xmm7, xmm2
+
+ movlps [edi + eax*4], xmm6
+ movss [edi + eax*4 + 8], xmm7
+
+ dec dword ptr [esp + innerk] /* one leftover j atom done */
+ jz .i3320_updateouterdata
+ jmp .i3320_odd_loop
+.i3320_updateouterdata: /* reduce the per-lane accumulators and write forces, shift force and energies back to memory */
+ mov ecx, [esp + ii3] /* index of the i oxygen in faction */
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3] /* index into fshift */
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* pairwise sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2 /* xmm7 lane 0 = z force */
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000 /* xmm6 lanes 0-1 = x,y force */
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* pairwise sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* xmm0 lanes 0-1 = x,y force */
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* pairwise sums are now in lanes 0-1 of xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000 /* xmm0 lanes 0-1 = x,y force */
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ mov edx, [ebp + gid]
+ mov edx, [edx] /* edx = *gid, the index used for Vc[] and Vnb[] below */
+ add dword ptr [ebp + gid], 4 /* advance the gid pointer (operand size made explicit) */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3320_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx /* store the decremented count back in the argument slot */
+ jmp .i3320_outer
+.i3320_end: /* epilogue: leave MMX state, release the stack frame, restore registers */
+ emms /* empty the MMX state - mm registers were used for table indices */
+ mov eax, [esp + salign]
+ add esp, eax /* presumably undoes the alignment adjustment made in the prologue (not visible here) */
+ add esp, 792 /* drop the local variable area; presumably matches the prologue's allocation */
+ pop edi /* pops mirror the pushes of the prologue in reverse order */
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+
+
+.globl inl3330_sse
+ .type inl3330_sse,@function
+inl3330_sse: /* entry: set up the frame and the per-call constants; the loops follow at .i3330_outer */
+.equ nri, 8 /* offsets from ebp of the caller-supplied arguments */
+.equ iinr, 12
+.equ jindex, 16
+.equ jjnr, 20
+.equ shift, 24
+.equ shiftvec, 28
+.equ fshift, 32
+.equ gid, 36
+.equ pos, 40
+.equ faction, 44
+.equ charge, 48
+.equ facel, 52
+.equ Vc, 56
+.equ type, 60
+.equ ntype, 64
+.equ nbfp, 68
+.equ Vnb, 72
+.equ tabscale, 76
+.equ VFtab, 80
+ /* stack offsets for local variables */
+ /* bottom of stack is 16-byte aligned for sse use */
+.equ ixO, 0
+.equ iyO, 16
+.equ izO, 32
+.equ ixH1, 48
+.equ iyH1, 64
+.equ izH1, 80
+.equ ixH2, 96
+.equ iyH2, 112
+.equ izH2, 128
+.equ jxO, 144
+.equ jyO, 160
+.equ jzO, 176
+.equ jxH1, 192
+.equ jyH1, 208
+.equ jzH1, 224
+.equ jxH2, 240
+.equ jyH2, 256
+.equ jzH2, 272
+.equ dxOO, 288
+.equ dyOO, 304
+.equ dzOO, 320
+.equ dxOH1, 336
+.equ dyOH1, 352
+.equ dzOH1, 368
+.equ dxOH2, 384
+.equ dyOH2, 400
+.equ dzOH2, 416
+.equ dxH1O, 432
+.equ dyH1O, 448
+.equ dzH1O, 464
+.equ dxH1H1, 480
+.equ dyH1H1, 496
+.equ dzH1H1, 512
+.equ dxH1H2, 528
+.equ dyH1H2, 544
+.equ dzH1H2, 560
+.equ dxH2O, 576
+.equ dyH2O, 592
+.equ dzH2O, 608
+.equ dxH2H1, 624
+.equ dyH2H1, 640
+.equ dzH2H1, 656
+.equ dxH2H2, 672
+.equ dyH2H2, 688
+.equ dzH2H2, 704
+.equ qqOO, 720
+.equ qqOH, 736
+.equ qqHH, 752
+.equ two, 768
+.equ tsc, 784
+.equ c6, 800
+.equ c12, 816
+.equ vctot, 832
+.equ vnbtot, 848
+.equ fixO, 864
+.equ fiyO, 880
+.equ fizO, 896
+.equ fixH1, 912
+.equ fiyH1, 928
+.equ fizH1, 944
+.equ fixH2, 960
+.equ fiyH2, 976
+.equ fizH2, 992
+.equ fjxO, 1008
+.equ fjyO, 1024
+.equ fjzO, 1040
+.equ fjxH1, 1056
+.equ fjyH1, 1072
+.equ fjzH1, 1088
+.equ fjxH2, 1104
+.equ fjyH2, 1120
+.equ fjzH2, 1136
+.equ half, 1152
+.equ three, 1168
+.equ rsqOO, 1184
+.equ rsqOH1, 1200
+.equ rsqOH2, 1216
+.equ rsqH1O, 1232
+.equ rsqH1H1, 1248
+.equ rsqH1H2, 1264
+.equ rsqH2O, 1280
+.equ rsqH2H1, 1296
+.equ rsqH2H2, 1312
+.equ rinvOO, 1328
+.equ rinvOH1, 1344
+.equ rinvOH2, 1360
+.equ rinvH1O, 1376
+.equ rinvH1H1, 1392
+.equ rinvH1H2, 1408
+.equ rinvH2O, 1424
+.equ rinvH2H1, 1440
+.equ rinvH2H2, 1456
+.equ fstmp, 1472
+.equ is3, 1488 /* the remaining locals are spaced 4 bytes apart */
+.equ ii3, 1492
+.equ innerjjnr, 1496
+.equ innerk, 1500
+.equ salign, 1504
+ push ebp
+ mov ebp,esp
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+ sub esp, 1508 /* local stack space */
+ mov eax, esp
+ and eax, 0xf /* eax = esp modulo 16 */
+ sub esp, eax /* move esp down to a 16-byte boundary */
+ mov [esp + salign], eax /* remember the adjustment */
+
+ emms
+
+ movups xmm0, [sse_half] /* sse_half, sse_two and sse_three are defined outside this section */
+ movups xmm1, [sse_two]
+ movups xmm2, [sse_three]
+ movss xmm3, [ebp +tabscale]
+ movaps [esp + half], xmm0
+ movaps [esp + two], xmm1
+ movaps [esp + three], xmm2
+ shufps xmm3, xmm3, 0 /* broadcast tabscale to all four lanes */
+ movaps [esp + tsc], xmm3
+
+ /* assume we have at least one i particle - start directly */
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ mov ebx, [ecx] /* ebx =ii */
+
+ mov edx, [ebp + charge]
+ movss xmm3, [edx + ebx*4] /* charge[ii] */
+ movss xmm4, xmm3
+ movss xmm5, [edx + ebx*4 + 4] /* charge[ii+1] */
+ movss xmm6, [ebp + facel]
+ mulss xmm3, xmm3 /* charge[ii]^2 */
+ mulss xmm4, xmm5 /* charge[ii]*charge[ii+1] */
+ mulss xmm5, xmm5 /* charge[ii+1]^2 */
+ mulss xmm3, xmm6 /* each product is scaled by facel */
+ mulss xmm4, xmm6
+ mulss xmm5, xmm6
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + qqOO], xmm3
+ movaps [esp + qqOH], xmm4
+ movaps [esp + qqHH], xmm5
+
+ xorps xmm0, xmm0
+ mov edx, [ebp + type]
+ mov ecx, [edx + ebx*4] /* ecx = type[ii] */
+ shl ecx, 1
+ mov edx, ecx /* edx = 2*type[ii] */
+ imul ecx, [ebp + ntype] /* ecx = ntia = 2*ntype*type[ii0] */
+ add edx, ecx /* edx = ntia + 2*type[ii] */
+ mov eax, [ebp + nbfp]
+ movlps xmm0, [eax + edx*4] /* two consecutive nbfp floats: c6, c12 */
+ movaps xmm1, xmm0
+ shufps xmm0, xmm0, 0 /* broadcast c6 */
+ shufps xmm1, xmm1, 0b01010101 /* broadcast c12 */
+ movaps [esp + c6], xmm0
+ movaps [esp + c12], xmm1
+
+.i3330_outer: /* per-i setup: shifted i coordinates, cleared accumulators, j-list bounds */
+ mov eax, [ebp + shift] /* eax = pointer into shift[] */
+ mov ebx, [eax] /* ebx=shift[n] */
+ add dword ptr [ebp + shift], 4 /* advance pointer one step (operand size made explicit) */
+
+ lea ebx, [ebx + ebx*2] /* ebx=3*is */
+ mov [esp + is3],ebx /* store is3 */
+
+ mov eax, [ebp + shiftvec] /* eax = base of shiftvec[] */
+
+ movss xmm0, [eax + ebx*4] /* xmm0-xmm2 = shift vector x, y, z */
+ movss xmm1, [eax + ebx*4 + 4]
+ movss xmm2, [eax + ebx*4 + 8]
+
+ mov ecx, [ebp + iinr] /* ecx = pointer into iinr[] */
+ add dword ptr [ebp + iinr], 4 /* advance pointer (operand size made explicit) */
+ mov ebx, [ecx] /* ebx =ii */
+
+ lea ebx, [ebx + ebx*2] /* ebx = 3*ii=ii3 */
+ mov eax, [ebp + pos] /* eax = base of pos[] */
+ mov [esp + ii3], ebx
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+ addss xmm3, [eax + ebx*4] /* shift + O position */
+ addss xmm4, [eax + ebx*4 + 4]
+ addss xmm5, [eax + ebx*4 + 8]
+ shufps xmm3, xmm3, 0 /* broadcast to all four lanes */
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixO], xmm3
+ movaps [esp + iyO], xmm4
+ movaps [esp + izO], xmm5
+
+ movss xmm3, xmm0 /* second copy of the shift vector, used for H2 */
+ movss xmm4, xmm1
+ movss xmm5, xmm2
+ addss xmm0, [eax + ebx*4 + 12] /* shift + H1 position */
+ addss xmm1, [eax + ebx*4 + 16]
+ addss xmm2, [eax + ebx*4 + 20]
+ addss xmm3, [eax + ebx*4 + 24] /* shift + H2 position */
+ addss xmm4, [eax + ebx*4 + 28]
+ addss xmm5, [eax + ebx*4 + 32]
+
+ shufps xmm0, xmm0, 0
+ shufps xmm1, xmm1, 0
+ shufps xmm2, xmm2, 0
+ shufps xmm3, xmm3, 0
+ shufps xmm4, xmm4, 0
+ shufps xmm5, xmm5, 0
+ movaps [esp + ixH1], xmm0
+ movaps [esp + iyH1], xmm1
+ movaps [esp + izH1], xmm2
+ movaps [esp + ixH2], xmm3
+ movaps [esp + iyH2], xmm4
+ movaps [esp + izH2], xmm5
+
+ /* clear vctot, vnbtot and i forces */
+ xorps xmm4, xmm4
+ movaps [esp + vctot], xmm4
+ movaps [esp + vnbtot], xmm4
+ movaps [esp + fixO], xmm4
+ movaps [esp + fiyO], xmm4
+ movaps [esp + fizO], xmm4
+ movaps [esp + fixH1], xmm4
+ movaps [esp + fiyH1], xmm4
+ movaps [esp + fizH1], xmm4
+ movaps [esp + fixH2], xmm4
+ movaps [esp + fiyH2], xmm4
+ movaps [esp + fizH2], xmm4
+
+ mov eax, [ebp + jindex]
+ mov ecx, [eax] /* jindex[n] */
+ mov edx, [eax + 4] /* jindex[n+1] */
+ add dword ptr [ebp + jindex], 4 /* advance the jindex pointer (operand size made explicit) */
+ sub edx, ecx /* number of innerloop atoms */
+
+ mov esi, [ebp + pos]
+ mov edi, [ebp + faction]
+ mov eax, [ebp + jjnr]
+ shl ecx, 2 /* byte offset of jjnr[nj0] */
+ add eax, ecx
+ mov [esp + innerjjnr], eax /* pointer to jjnr[nj0] */
+ sub edx, 4 /* trial subtraction: is there a full group of four? */
+ mov [esp + innerk], edx /* number of innerloop atoms minus 4; mov keeps the flags from sub */
+ jge .i3330_unroll_loop
+ jmp .i3330_single_check
+.i3330_unroll_loop:
+ /* quad-unroll innerloop here */
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+
+ mov eax, [edx]
+ mov ebx, [edx + 4]
+ mov ecx, [edx + 8]
+ mov edx, [edx + 12] /* eax-edx=jnr1-4 */
+
+ add [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+ mov esi, [ebp + pos] /* base of pos[] */
+
+ lea eax, [eax + eax*2] /* replace jnr with j3 */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2] /* replace jnr with j3 */
+ lea edx, [edx + edx*2]
+
+ /* move j coordinates to local temp variables */
+ movlps xmm2, [esi + eax*4]
+ movlps xmm3, [esi + eax*4 + 12]
+ movlps xmm4, [esi + eax*4 + 24]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm6, [esi + ebx*4 + 12]
+ movlps xmm7, [esi + ebx*4 + 24]
+
+ movhps xmm2, [esi + ecx*4]
+ movhps xmm3, [esi + ecx*4 + 12]
+ movhps xmm4, [esi + ecx*4 + 24]
+
+ movhps xmm5, [esi + edx*4]
+ movhps xmm6, [esi + edx*4 + 12]
+ movhps xmm7, [esi + edx*4 + 24]
+
+ /* current state: */
+ /* xmm2= jxOa jyOa jxOc jyOc */
+ /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+ /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+ /* xmm5= jxOb jyOb jxOd jyOd */
+ /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+ /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+ movaps xmm0, xmm2
+ movaps xmm1, xmm3
+ unpcklps xmm0, xmm5 /* xmm0= jxOa jxOb jyOa jyOb */
+ unpcklps xmm1, xmm6 /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+ unpckhps xmm2, xmm5 /* xmm2= jxOc jxOd jyOc jyOd */
+ unpckhps xmm3, xmm6 /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+ movaps xmm5, xmm4
+ movaps xmm6, xmm0
+ unpcklps xmm4, xmm7 /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+ unpckhps xmm5, xmm7 /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+ movaps xmm7, xmm1
+ movlhps xmm0, xmm2 /* xmm0= jxOa jxOb jxOc jxOd */
+ movaps [esp + jxO], xmm0
+ movhlps xmm2, xmm6 /* xmm2= jyOa jyOb jyOc jyOd */
+ movaps [esp + jyO], xmm2
+ movlhps xmm1, xmm3
+ movaps [esp + jxH1], xmm1
+ movhlps xmm3, xmm7
+ movaps xmm6, xmm4
+ movaps [esp + jyH1], xmm3
+ movlhps xmm4, xmm5
+ movaps [esp + jxH2], xmm4
+ movhlps xmm5, xmm6
+ movaps [esp + jyH2], xmm5
+
+ movss xmm0, [esi + eax*4 + 8]
+ movss xmm1, [esi + eax*4 + 20]
+ movss xmm2, [esi + eax*4 + 32]
+
+ movss xmm3, [esi + ecx*4 + 8]
+ movss xmm4, [esi + ecx*4 + 20]
+ movss xmm5, [esi + ecx*4 + 32]
+
+ movhps xmm0, [esi + ebx*4 + 4]
+ movhps xmm1, [esi + ebx*4 + 16]
+ movhps xmm2, [esi + ebx*4 + 28]
+
+ movhps xmm3, [esi + edx*4 + 4]
+ movhps xmm4, [esi + edx*4 + 16]
+ movhps xmm5, [esi + edx*4 + 28]
+
+ shufps xmm0, xmm3, 0b11001100
+ shufps xmm1, xmm4, 0b11001100
+ shufps xmm2, xmm5, 0b11001100
+ movaps [esp + jzO], xmm0
+ movaps [esp + jzH1], xmm1
+ movaps [esp + jzH2], xmm2
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixO]
+ movaps xmm4, [esp + iyO]
+ movaps xmm5, [esp + izO]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxOH1], xmm3
+ movaps [esp + dyOH1], xmm4
+ movaps [esp + dzOH1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOO], xmm0
+ movaps [esp + rsqOH1], xmm3
+
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxOH2], xmm0
+ movaps [esp + dyOH2], xmm1
+ movaps [esp + dzOH2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1O], xmm3
+ movaps [esp + dyH1O], xmm4
+ movaps [esp + dzH1O], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqOH2], xmm0
+ movaps [esp + rsqH1O], xmm3
+
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH1]
+ movaps xmm4, [esp + iyH1]
+ movaps xmm5, [esp + izH1]
+ subps xmm0, [esp + jxH1]
+ subps xmm1, [esp + jyH1]
+ subps xmm2, [esp + jzH1]
+ subps xmm3, [esp + jxH2]
+ subps xmm4, [esp + jyH2]
+ subps xmm5, [esp + jzH2]
+ movaps [esp + dxH1H1], xmm0
+ movaps [esp + dyH1H1], xmm1
+ movaps [esp + dzH1H1], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH1H2], xmm3
+ movaps [esp + dyH1H2], xmm4
+ movaps [esp + dzH1H2], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm3, xmm4
+ addps xmm3, xmm5
+ movaps [esp + rsqH1H1], xmm0
+ movaps [esp + rsqH1H2], xmm3
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxH1]
+ subps xmm4, [esp + jyH1]
+ subps xmm5, [esp + jzH1]
+ movaps [esp + dxH2O], xmm0
+ movaps [esp + dyH2O], xmm1
+ movaps [esp + dzH2O], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ movaps [esp + dxH2H1], xmm3
+ movaps [esp + dyH2H1], xmm4
+ movaps [esp + dzH2H1], xmm5
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ addps xmm4, xmm3
+ addps xmm4, xmm5
+ movaps [esp + rsqH2O], xmm0
+ movaps [esp + rsqH2H1], xmm4
+
+ movaps xmm0, [esp + ixH2]
+ movaps xmm1, [esp + iyH2]
+ movaps xmm2, [esp + izH2]
+ subps xmm0, [esp + jxH2]
+ subps xmm1, [esp + jyH2]
+ subps xmm2, [esp + jzH2]
+ movaps [esp + dxH2H2], xmm0
+ movaps [esp + dyH2H2], xmm1
+ movaps [esp + dzH2H2], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2
+ movaps [esp + rsqH2H2], xmm0
+
+ /* invsqrt for all nine atom pairs, two at a time. xmm0 holds rsqH2H2 (stored just above); xmm4 was set before this point and its result is stored as rinvH2H1 */
+ rsqrtps xmm1, xmm0 /* xmm1=approximate 1/sqrt(rsqH2H2), called lu below */
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1 /* keep the raw estimates in xmm2/xmm6 */
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1 /* lu*lu */
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three] /* constant slot (presumably 3.0 - filled outside this view) */
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0 /* rsq*lu*lu */
+ mulps xmm5, xmm4
+ subps xmm3, xmm1 /* three-rsq*lu*lu */
+ subps xmm7, xmm5
+ mulps xmm3, xmm2 /* lu*(three-rsq*lu*lu) */
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH2H2 */
+ mulps xmm7, [esp + half] /* rinvH2H1 */
+ movaps [esp + rinvH2H2], xmm3
+ movaps [esp + rinvH2H1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOO] /* same refinement, now for OO (xmm1-xmm3) and OH1 (xmm5-xmm7) */
+ rsqrtps xmm5, [esp + rsqOH1]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOO] /* rsq is re-read from the stack, no register holds it here */
+ mulps xmm5, [esp + rsqOH1]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvOO */
+ mulps xmm7, [esp + half] /* rinvOH1 */
+ movaps [esp + rinvOO], xmm3
+ movaps [esp + rinvOH1], xmm7
+
+ rsqrtps xmm1, [esp + rsqOH2] /* OH2 and H1O */
+ rsqrtps xmm5, [esp + rsqH1O]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqOH2]
+ mulps xmm5, [esp + rsqH1O]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvOH2 */
+ mulps xmm7, [esp + half] /* rinvH1O */
+ movaps [esp + rinvOH2], xmm3
+ movaps [esp + rinvH1O], xmm7
+
+ rsqrtps xmm1, [esp + rsqH1H1] /* H1H1 and H1H2 */
+ rsqrtps xmm5, [esp + rsqH1H2]
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, [esp + rsqH1H1]
+ mulps xmm5, [esp + rsqH1H2]
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinvH1H1 */
+ mulps xmm7, [esp + half] /* rinvH1H2 */
+ movaps [esp + rinvH1H1], xmm3
+ movaps [esp + rinvH1H2], xmm7
+
+ rsqrtps xmm1, [esp + rsqH2O] /* the ninth pair, H2O, is done on its own */
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, [esp + rsqH2O]
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinvH2O */
+ movaps [esp + rinvH2O], xmm3
+
+ /* start with OO interaction: tabulated coulomb, dispersion and repulsion, four lanes at once */
+ movaps xmm0, [esp + rinvOO] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOO] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats, all four lanes */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd mm0, eax /* park eax-edx in mm0-mm3; they are read back before the j force update */
+ movd mm1, ebx
+ movd mm2, ecx
+ movd mm3, edx
+
+ mov esi, [ebp + VFtab] /* esi=table base, reused by the sections that follow */
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOO]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ /* increment vcoul - then we can get rid of xmm5 */
+ /* update vctot */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ /* put scalar force on stack temporarily */
+ movaps [esp + fstmp], xmm3
+
+ /* dispersion: same table point, entries at byte offsets 16-31 */
+ movlps xmm5, [esi + eax*4 + 16]
+ movlps xmm7, [esi + ecx*4 + 16]
+ movhps xmm5, [esi + ebx*4 + 16]
+ movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 24]
+ movlps xmm3, [esi + ecx*4 + 24]
+ movhps xmm7, [esi + ebx*4 + 24]
+ movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fstmp] /* add to fscal */
+
+ /* put scalar force on stack. Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fstmp], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion: entries at byte offsets 32-47 */
+ movlps xmm5, [esi + eax*4 + 32]
+ movlps xmm7, [esi + ecx*4 + 32]
+ movhps xmm5, [esi + ebx*4 + 32]
+ movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + eax*4 + 40]
+ movlps xmm3, [esi + ecx*4 + 40]
+ movhps xmm7, [esi + ebx*4 + 40]
+ movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+
+ movaps xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fstmp] /* xmm7=fijC+fijD+fijR */
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm1, xmm1
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0 /* summed force term *tsc*rinv */
+ subps xmm1, xmm7 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3 /* xmm3-xmm5=0: the j O force accumulators start from scratch here */
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO] /* i side accumulates */
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H1 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvOH1] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH1] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3 /* xmm3-xmm5=0: the j H1 force accumulators start from scratch here */
+ mulps xmm0, [esp + dxOH1]
+ mulps xmm1, [esp + dyOH1]
+ mulps xmm2, [esp + dzOH1] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO] /* i side accumulates */
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* O-H2 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvOH2] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqOH2] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ xorps xmm3, xmm3
+ movaps xmm4, xmm3
+ movaps xmm5, xmm3 /* xmm3-xmm5=0: the j H2 force accumulators start from scratch here */
+ mulps xmm0, [esp + dxOH2]
+ mulps xmm1, [esp + dyOH2]
+ mulps xmm2, [esp + dzOH2] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO] /* i side accumulates */
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+ /* H1-O interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH1O] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1O] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO] /* continue the j O accumulators started in the OO section */
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1] /* i side accumulates */
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H1 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH1H1] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1] /* continue the j H1 accumulators started in the O-H1 section */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH1H1]
+ mulps xmm1, [esp + dyH1H1]
+ mulps xmm2, [esp + dzH1H1] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1] /* i side accumulates */
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H1-H2 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH1H2] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH1H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2] /* continue the j H2 accumulators started in the O-H2 section */
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH1H2]
+ mulps xmm1, [esp + dyH1H2]
+ mulps xmm2, [esp + dzH1H2] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1] /* i side accumulates */
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+
+ /* H2-O interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH2O] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2O] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqOH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxO] /* continue the j O accumulators */
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2] /* i side accumulates */
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H1 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH2H1] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H1] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH1] /* continue the j H1 accumulators */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ mulps xmm0, [esp + dxH2H1]
+ mulps xmm1, [esp + dyH2H1]
+ mulps xmm2, [esp + dzH2H1] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2] /* i side accumulates */
+ movaps [esp + fjxH1], xmm3
+ movaps [esp + fjyH1], xmm4
+ movaps [esp + fjzH1], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* H2-H2 interaction: tabulated coulomb only, four lanes at once */
+ movaps xmm0, [esp + rinvH2H2] /* xmm0=rinv, kept until the force scaling below */
+ movaps xmm1, xmm0
+ mulps xmm1, [esp + rsqH2H2] /* xmm1=r */
+ mulps xmm1, [esp + tsc] /* xmm1=r*tsc, the table coordinate */
+ movhlps xmm2, xmm1 /* lanes 2,3 to the low half of xmm2 */
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2 /* xmm3=truncated indices as floats */
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2 /* indices*4 */
+
+ movd eax, mm6 /* eax=index of lane 0 */
+ psrlq mm6, 32
+ movd ecx, mm7 /* ecx=index of lane 2 */
+ psrlq mm7, 32
+ movd ebx, mm6 /* ebx=index of lane 1 */
+ movd edx, mm7 /* edx=index of lane 3 */
+
+ lea eax, [eax + eax*2] /* *3; with the *4 address scale each table point spans 12 floats */
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + eax*4] /* esi still holds the VFtab base loaded in the OO section */
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm5, [esi + ebx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000 /* xmm4=first entry (Y) of each lane */
+ shufps xmm5, xmm7, 0b11011101 /* xmm5=second entry (F) of each lane */
+
+ movlps xmm7, [esi + eax*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm7, [esi + ebx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000 /* xmm6=third entry (G) */
+ shufps xmm7, xmm3, 0b11011101 /* xmm7=fourth entry (H) */
+ /* coulomb table ready, in xmm4-xmm7 */
+
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+ movaps xmm3, [esp + qqHH]
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5 /* vctot += vcoul */
+ xorps xmm1, xmm1
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0 /* fijC*tsc*rinv */
+ subps xmm1, xmm3 /* xmm1=negated value, the scalar force factor */
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ movaps xmm3, [esp + fjxH2] /* continue the j H2 accumulators */
+ movaps xmm4, [esp + fjyH2]
+ movaps xmm5, [esp + fjzH2]
+ mulps xmm0, [esp + dxH2H2]
+ mulps xmm1, [esp + dyH2H2]
+ mulps xmm2, [esp + dzH2H2] /* xmm0-xmm2=force vector */
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2 /* j side takes the opposite sign */
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2] /* i side accumulates */
+ movaps [esp + fjxH2], xmm3
+ movaps [esp + fjyH2], xmm4
+ movaps [esp + fjzH2], xmm5
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ mov edi, [ebp +faction] /* edi=base of the force array updated below */
+
+ movd eax, mm0 /* bring back the eax-edx values parked in mm0-mm3 during the OO section */
+ movd ebx, mm1
+ movd ecx, mm2
+ movd edx, mm3
+
+ /* Did all interactions - now update j forces */
+ /* 4 j waters with three atoms each - first do a & b j particles */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpcklps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjxOb fjyOb */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOb fjyOb */
+ unpcklps xmm1, xmm2 /* xmm1= fjzOa fjxH1a fjzOb fjxH1b */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpcklps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1b fjzH1b */
+ unpcklps xmm5, xmm6 /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+ movlhps xmm0, xmm1 /* xmm0= fjxOa fjyOa fjzOa fjxH1a */
+ shufps xmm3, xmm1, 0b11100100 /* keep own low half, take the high half of xmm1 */
+ /* xmm3= fjxOb fjyOb fjzOb fjxH1b */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+ shufps xmm7, xmm5, 0b11100100 /* keep own low half, take the high half of xmm5 */
+ /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+ movups xmm1, [edi + eax*4] /* current force values for water a, floats 0-3 */
+ movups xmm2, [edi + eax*4 + 16] /* water a, floats 4-7 */
+ movups xmm5, [edi + ebx*4] /* water b, floats 0-3 */
+ movups xmm6, [edi + ebx*4 + 16] /* water b, floats 4-7 */
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + eax*4 + 32] /* ninth float of water a */
+ movss xmm3, [edi + ebx*4 + 32] /* ninth float of water b */
+
+ movaps xmm4, [esp + fjzH2] /* lane 0 = fjzH2a */
+ movaps xmm7, xmm4
+ shufps xmm7, xmm7, 1 /* lane 0 = fjzH2b */
+
+ movups [edi + eax*4], xmm1
+ movups [edi + eax*4 + 16],xmm2
+ movups [edi + ebx*4], xmm5
+ movups [edi + ebx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + eax*4 + 32], xmm0
+ movss [edi + ebx*4 + 32], xmm3
+
+ /* then do the second pair (c & d) */
+ movaps xmm0, [esp + fjxO] /* xmm0= fjxOa fjxOb fjxOc fjxOd */
+ movaps xmm1, [esp + fjyO] /* xmm1= fjyOa fjyOb fjyOc fjyOd */
+ unpckhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjxOd fjyOd */
+ movaps xmm1, [esp + fjzO]
+ movaps xmm2, [esp + fjxH1]
+ movhlps xmm3, xmm0 /* xmm3= fjxOd fjyOd */
+ unpckhps xmm1, xmm2 /* xmm1= fjzOc fjxH1c fjzOd fjxH1d */
+ movaps xmm4, [esp + fjyH1]
+ movaps xmm5, [esp + fjzH1]
+ unpckhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+ movaps xmm5, [esp + fjxH2]
+ movaps xmm6, [esp + fjyH2]
+ movhlps xmm7, xmm4 /* xmm7= fjyH1d fjzH1d */
+ unpckhps xmm5, xmm6 /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+ movlhps xmm0, xmm1 /* xmm0= fjxOc fjyOc fjzOc fjxH1c */
+ shufps xmm3, xmm1, 0b11100100 /* keep own low half, take the high half of xmm1 */
+ /* xmm3= fjxOd fjyOd fjzOd fjxH1d */
+ movlhps xmm4, xmm5 /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c */
+ shufps xmm7, xmm5, 0b11100100 /* keep own low half, take the high half of xmm5 */
+ /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+ movups xmm1, [edi + ecx*4] /* current force values for water c, floats 0-3 */
+ movups xmm2, [edi + ecx*4 + 16] /* water c, floats 4-7 */
+ movups xmm5, [edi + edx*4] /* water d, floats 0-3 */
+ movups xmm6, [edi + edx*4 + 16] /* water d, floats 4-7 */
+ addps xmm1, xmm0
+ addps xmm2, xmm4
+ addps xmm5, xmm3
+ addps xmm6, xmm7
+ movss xmm0, [edi + ecx*4 + 32] /* ninth float of water c */
+ movss xmm3, [edi + edx*4 + 32] /* ninth float of water d */
+
+ movaps xmm4, [esp + fjzH2]
+ movaps xmm7, xmm4
+ shufps xmm4, xmm4, 0b10 /* lane 0 = fjzH2c */
+ shufps xmm7, xmm7, 0b11 /* lane 0 = fjzH2d */
+ movups [edi + ecx*4], xmm1
+ movups [edi + ecx*4 + 16],xmm2
+ movups [edi + edx*4], xmm5
+ movups [edi + edx*4 + 16],xmm6
+ addss xmm0, xmm4
+ addss xmm3, xmm7
+ movss [edi + ecx*4 + 32], xmm0
+ movss [edi + edx*4 + 32], xmm3
+
+ /* loop control: should we do one more unrolled iteration? Stack counters are sized explicitly (dword ptr) because a memory,immediate operand pair carries no size of its own */
+ sub dword ptr [esp + innerk], 4 /* innerk -= 4 */
+ jl .i3330_single_check /* innerk was below 4 before the subtraction: leave the unrolled loop */
+ jmp .i3330_unroll_loop
+.i3330_single_check:
+ add dword ptr [esp + innerk], 4 /* undo the subtraction; ZF is set when the result is zero */
+ jnz .i3330_single_loop /* nonzero count left: handle it in the single loop */
+ jmp .i3330_updateouterdata
+.i3330_single_loop:
+ mov edx, [esp + innerjjnr] /* pointer to jjnr[k] */
+ mov eax, [edx] /* eax=the entry it points at */
+ add dword ptr [esp + innerjjnr], 4 /* advance the stored pointer by 4 bytes (one 32-bit entry) */
+
+ mov esi, [ebp + pos]
+ lea eax, [eax + eax*2]
+
+ /* fetch j coordinates */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ movss xmm3, [esi + eax*4]
+ movss xmm4, [esi + eax*4 + 4]
+ movss xmm5, [esi + eax*4 + 8]
+
+ movlps xmm6, [esi + eax*4 + 12]
+ movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+ /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+ movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+ shufps xmm6, xmm6, 0b11011000 /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+ movlhps xmm3, xmm6 /* xmm3= jxO 0 jxH1 jxH2 */
+ movaps xmm0, [esp + ixO]
+ movaps xmm1, [esp + iyO]
+ movaps xmm2, [esp + izO]
+ shufps xmm4, xmm6, 0b11100100 /* xmm4= jyO 0 jyH1 jyH2 */
+ shufps xmm5, xmm7, 0b11000100 /* xmm5= jzO 0 jzH1 jzH2 */
+ /* store all j coordinates in jO */
+ movaps [esp + jxO], xmm3
+ movaps [esp + jyO], xmm4
+ movaps [esp + jzO], xmm5
+ subps xmm0, xmm3
+ subps xmm1, xmm4
+ subps xmm2, xmm5
+ movaps [esp + dxOO], xmm0
+ movaps [esp + dyOO], xmm1
+ movaps [esp + dzOO], xmm2
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ addps xmm0, xmm1
+ addps xmm0, xmm2 /* have rsq in xmm0 */
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ movaps xmm2, xmm1
+ mulps xmm1, xmm1
+ movaps xmm3, [esp + three]
+ mulps xmm1, xmm0
+ subps xmm3, xmm1
+ mulps xmm3, xmm2
+ mulps xmm3, [esp + half] /* rinv iO - j water */
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ mov esi, [ebp + VFtab]
+
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOO]
+ movhps xmm3, [esp + qqOH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+ /* put scalar force on stack temporarily */
+ movaps [esp + fstmp], xmm3
+
+ /* dispersion */
+ movss xmm4, [esi + ebx*4 + 16]
+ movss xmm5, [esi + ebx*4 + 20]
+ movss xmm6, [esi + ebx*4 + 24]
+ movss xmm7, [esi + ebx*4 + 28]
+ /* dispersion table ready, in xmm4-xmm7 */
+ mulss xmm6, xmm1 /* xmm6=Geps */
+ mulss xmm7, xmm2 /* xmm7=Heps2 */
+ addss xmm5, xmm6
+ addss xmm5, xmm7 /* xmm5=Fp */
+ mulss xmm7, [esp + two] /* two*Heps2 */
+ addss xmm7, xmm6
+ addss xmm7, xmm5 /* xmm7=FF */
+ mulss xmm5, xmm1 /* xmm5=eps*Fp */
+ addss xmm5, xmm4 /* xmm5=VV */
+ xorps xmm4, xmm4
+ movss xmm4, [esp + c6]
+ mulps xmm7, xmm4 /* fijD */
+ mulps xmm5, xmm4 /* vnb6 */
+ addps xmm7, [esp + fstmp] /* add to fscal */
+
+ /* put scalar force on stack. Update vnbtot directly */
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + fstmp], xmm7
+ movaps [esp + vnbtot], xmm5
+
+ /* repulsion */
+ movss xmm4, [esi + ebx*4 + 32]
+ movss xmm5, [esi + ebx*4 + 36]
+ movss xmm6, [esi + ebx*4 + 40]
+ movss xmm7, [esi + ebx*4 + 44]
+ /* table ready, in xmm4-xmm7 */
+ mulss xmm6, xmm1 /* xmm6=Geps */
+ mulss xmm7, xmm2 /* xmm7=Heps2 */
+ addss xmm5, xmm6
+ addss xmm5, xmm7 /* xmm5=Fp */
+ mulss xmm7, [esp + two] /* two*Heps2 */
+ addss xmm7, xmm6
+ addss xmm7, xmm5 /* xmm7=FF */
+ mulss xmm5, xmm1 /* xmm5=eps*Fp */
+ addss xmm5, xmm4 /* xmm5=VV */
+
+ xorps xmm4, xmm4
+ movss xmm4, [esp + c12]
+ mulps xmm7, xmm4 /* fijR */
+ mulps xmm5, xmm4 /* vnb12 */
+ addps xmm7, [esp + fstmp]
+
+ addps xmm5, [esp + vnbtot]
+ movaps [esp + vnbtot], xmm5
+ xorps xmm1, xmm1
+
+ mulps xmm7, [esp + tsc]
+ mulps xmm7, xmm0
+ subps xmm1, xmm7
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ mulps xmm0, [esp + dxOO]
+ mulps xmm1, [esp + dyOO]
+ mulps xmm2, [esp + dzOO]
+ /* initial update for j forces */
+ xorps xmm3, xmm3
+ xorps xmm4, xmm4
+ xorps xmm5, xmm5
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixO]
+ addps xmm1, [esp + fiyO]
+ addps xmm2, [esp + fizO]
+ movaps [esp + fixO], xmm0
+ movaps [esp + fiyO], xmm1
+ movaps [esp + fizO], xmm2
+
+
+ /* done with i O. Now do i H1 & H2 simultaneously; first get i particle coords: */
+ movaps xmm0, [esp + ixH1]
+ movaps xmm1, [esp + iyH1]
+ movaps xmm2, [esp + izH1]
+ movaps xmm3, [esp + ixH2]
+ movaps xmm4, [esp + iyH2]
+ movaps xmm5, [esp + izH2]
+ subps xmm0, [esp + jxO]
+ subps xmm1, [esp + jyO]
+ subps xmm2, [esp + jzO]
+ subps xmm3, [esp + jxO]
+ subps xmm4, [esp + jyO]
+ subps xmm5, [esp + jzO]
+ movaps [esp + dxH1O], xmm0
+ movaps [esp + dyH1O], xmm1
+ movaps [esp + dzH1O], xmm2
+ movaps [esp + dxH2O], xmm3
+ movaps [esp + dyH2O], xmm4
+ movaps [esp + dzH2O], xmm5
+ mulps xmm0, xmm0
+ mulps xmm1, xmm1
+ mulps xmm2, xmm2
+ mulps xmm3, xmm3
+ mulps xmm4, xmm4
+ mulps xmm5, xmm5
+ addps xmm0, xmm1
+ addps xmm4, xmm3
+ addps xmm0, xmm2 /* have rsqH1 in xmm0 */
+ addps xmm4, xmm5 /* have rsqH2 in xmm4 */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rsqH2O], xmm4
+
+ /* do invsqrt */
+ rsqrtps xmm1, xmm0
+ rsqrtps xmm5, xmm4
+ movaps xmm2, xmm1
+ movaps xmm6, xmm5
+ mulps xmm1, xmm1
+ mulps xmm5, xmm5
+ movaps xmm3, [esp + three]
+ movaps xmm7, xmm3
+ mulps xmm1, xmm0
+ mulps xmm5, xmm4
+ subps xmm3, xmm1
+ subps xmm7, xmm5
+ mulps xmm3, xmm2
+ mulps xmm7, xmm6
+ mulps xmm3, [esp + half] /* rinv H1 - j water */
+ mulps xmm7, [esp + half] /* rinv H2 - j water */
+
+ /* start with H1, save H2 data */
+ movaps [esp + rinvH2O], xmm7
+
+ movaps xmm1, xmm3
+ mulps xmm1, xmm0 /* xmm1=r */
+ movaps xmm0, xmm3 /* xmm0=rinv */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+ mulps xmm0, [esp + dxH1O]
+ mulps xmm1, [esp + dyH1O]
+ mulps xmm2, [esp + dzH1O]
+ /* update forces H1 - j water */
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH1]
+ addps xmm1, [esp + fiyH1]
+ addps xmm2, [esp + fizH1]
+ movaps [esp + fixH1], xmm0
+ movaps [esp + fiyH1], xmm1
+ movaps [esp + fizH1], xmm2
+ /* do table for H2 - j water interaction */
+ movaps xmm0, [esp + rinvH2O]
+ movaps xmm1, [esp + rsqH2O]
+ mulps xmm1, xmm0 /* xmm0=rinv, xmm1=r */
+ mulps xmm1, [esp + tsc]
+
+ movhlps xmm2, xmm1
+ cvttps2pi mm6, xmm1
+ cvttps2pi mm7, xmm2 /* mm6/mm7 contain lu indices */
+ cvtpi2ps xmm3, mm6
+ cvtpi2ps xmm2, mm7
+ movlhps xmm3, xmm2
+ subps xmm1, xmm3 /* xmm1=eps */
+ movaps xmm2, xmm1
+ mulps xmm2, xmm2 /* xmm2=eps2 */
+ pslld mm6, 2
+ pslld mm7, 2
+ movd ebx, mm6
+ movd ecx, mm7
+ psrlq mm7, 32
+ movd edx, mm7 /* table indices in ebx,ecx,edx */
+
+ lea ebx, [ebx + ebx*2]
+ lea ecx, [ecx + ecx*2]
+ lea edx, [edx + edx*2]
+
+ movlps xmm5, [esi + ebx*4]
+ movlps xmm7, [esi + ecx*4]
+ movhps xmm7, [esi + edx*4] /* got half coulomb table */
+ movaps xmm4, xmm5
+ shufps xmm4, xmm7, 0b10001000
+ shufps xmm5, xmm7, 0b11011101
+
+ movlps xmm7, [esi + ebx*4 + 8]
+ movlps xmm3, [esi + ecx*4 + 8]
+ movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm3, 0b10001000
+ shufps xmm7, xmm3, 0b11011101
+ /* coulomb table ready, in xmm4-xmm7 */
+ mulps xmm6, xmm1 /* xmm6=Geps */
+ mulps xmm7, xmm2 /* xmm7=Heps2 */
+ addps xmm5, xmm6
+ addps xmm5, xmm7 /* xmm5=Fp */
+ mulps xmm7, [esp + two] /* two*Heps2 */
+
+ xorps xmm3, xmm3
+ /* fetch charges to xmm3 (temporary) */
+ movss xmm3, [esp + qqOH]
+ movhps xmm3, [esp + qqHH]
+
+ addps xmm7, xmm6
+ addps xmm7, xmm5 /* xmm7=FF */
+ mulps xmm5, xmm1 /* xmm5=eps*Fp */
+ addps xmm5, xmm4 /* xmm5=VV */
+ mulps xmm5, xmm3 /* vcoul=qq*VV */
+ mulps xmm3, xmm7 /* fijC=FF*qq */
+ /* at this point xmm5 contains vcoul and xmm3 fijC */
+ addps xmm5, [esp + vctot]
+ movaps [esp + vctot], xmm5
+
+ xorps xmm1, xmm1
+
+ mulps xmm3, [esp + tsc]
+ mulps xmm3, xmm0
+ subps xmm1, xmm3
+
+ movaps xmm0, xmm1
+ movaps xmm2, xmm1
+
+ mulps xmm0, [esp + dxH2O]
+ mulps xmm1, [esp + dyH2O]
+ mulps xmm2, [esp + dzH2O]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ subps xmm3, xmm0
+ subps xmm4, xmm1
+ subps xmm5, xmm2
+ mov esi, [ebp + faction]
+ movaps [esp + fjxO], xmm3
+ movaps [esp + fjyO], xmm4
+ movaps [esp + fjzO], xmm5
+ addps xmm0, [esp + fixH2]
+ addps xmm1, [esp + fiyH2]
+ addps xmm2, [esp + fizH2]
+ movaps [esp + fixH2], xmm0
+ movaps [esp + fiyH2], xmm1
+ movaps [esp + fizH2], xmm2
+
+ /* update j water forces from local variables */
+ movlps xmm0, [esi + eax*4]
+ movlps xmm1, [esi + eax*4 + 12]
+ movhps xmm1, [esi + eax*4 + 24]
+ movaps xmm3, [esp + fjxO]
+ movaps xmm4, [esp + fjyO]
+ movaps xmm5, [esp + fjzO]
+ movaps xmm6, xmm5
+ movaps xmm7, xmm5
+ shufps xmm6, xmm6, 0b10
+ shufps xmm7, xmm7, 0b11
+ addss xmm5, [esi + eax*4 + 8]
+ addss xmm6, [esi + eax*4 + 20]
+ addss xmm7, [esi + eax*4 + 32]
+ movss [esi + eax*4 + 8], xmm5
+ movss [esi + eax*4 + 20], xmm6
+ movss [esi + eax*4 + 32], xmm7
+ movaps xmm5, xmm3
+ unpcklps xmm3, xmm4
+ unpckhps xmm5, xmm4
+ addps xmm0, xmm3
+ addps xmm1, xmm5
+ movlps [esi + eax*4], xmm0
+ movlps [esi + eax*4 + 12], xmm1
+ movhps [esi + eax*4 + 24], xmm1
+
+ dec dword ptr [esp + innerk]
+ jz .i3330_updateouterdata
+ jmp .i3330_single_loop
+.i3330_updateouterdata:
+ mov ecx, [esp + ii3]
+ mov edi, [ebp + faction]
+ mov esi, [ebp + fshift]
+ mov edx, [esp + is3]
+
+ /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixO]
+ movaps xmm1, [esp + fiyO]
+ movaps xmm2, [esp + fizO]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4]
+ movss xmm4, [edi + ecx*4 + 4]
+ movss xmm5, [edi + ecx*4 + 8]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4], xmm3
+ movss [edi + ecx*4 + 4], xmm4
+ movss [edi + ecx*4 + 8], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ movaps xmm6, xmm0
+ movss xmm7, xmm2
+ movlhps xmm6, xmm1
+ shufps xmm6, xmm6, 0b1000
+
+ /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH1]
+ movaps xmm1, [esp + fiyH1]
+ movaps xmm2, [esp + fizH1]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 12]
+ movss xmm4, [edi + ecx*4 + 16]
+ movss xmm5, [edi + ecx*4 + 20]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 12], xmm3
+ movss [edi + ecx*4 + 16], xmm4
+ movss [edi + ecx*4 + 20], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+ movaps xmm0, [esp + fixH2]
+ movaps xmm1, [esp + fiyH2]
+ movaps xmm2, [esp + fizH2]
+
+ movhlps xmm3, xmm0
+ movhlps xmm4, xmm1
+ movhlps xmm5, xmm2
+ addps xmm0, xmm3
+ addps xmm1, xmm4
+ addps xmm2, xmm5 /* sum is in 1/2 i xmm0-xmm2 */
+
+ movaps xmm3, xmm0
+ movaps xmm4, xmm1
+ movaps xmm5, xmm2
+
+ shufps xmm3, xmm3, 1
+ shufps xmm4, xmm4, 1
+ shufps xmm5, xmm5, 1
+ addss xmm0, xmm3
+ addss xmm1, xmm4
+ addss xmm2, xmm5 /* xmm0-xmm2 has single force in pos0 */
+
+ /* increment i force */
+ movss xmm3, [edi + ecx*4 + 24]
+ movss xmm4, [edi + ecx*4 + 28]
+ movss xmm5, [edi + ecx*4 + 32]
+ addss xmm3, xmm0
+ addss xmm4, xmm1
+ addss xmm5, xmm2
+ movss [edi + ecx*4 + 24], xmm3
+ movss [edi + ecx*4 + 28], xmm4
+ movss [edi + ecx*4 + 32], xmm5
+
+ /* accumulate force in xmm6/xmm7 for fshift */
+ addss xmm7, xmm2
+ movlhps xmm0, xmm1
+ shufps xmm0, xmm0, 0b1000
+ addps xmm6, xmm0
+
+ /* increment fshift force */
+ movlps xmm3, [esi + edx*4]
+ movss xmm4, [esi + edx*4 + 8]
+ addps xmm3, xmm6
+ addss xmm4, xmm7
+ movlps [esi + edx*4], xmm3
+ movss [esi + edx*4 + 8], xmm4
+
+ /* get group index for i particle */
+ mov edx, [ebp + gid] /* get group index for this i particle */
+ mov edx, [edx]
+ add [ebp + gid], 4 /* advance pointer */
+
+ /* accumulate total potential energy and update it */
+ movaps xmm7, [esp + vctot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vc]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* accumulate total lj energy and update it */
+ movaps xmm7, [esp + vnbtot]
+ /* accumulate */
+ movhlps xmm6, xmm7
+ addps xmm7, xmm6 /* pos 0-1 in xmm7 have the sum now */
+ movaps xmm6, xmm7
+ shufps xmm6, xmm6, 1
+ addss xmm7, xmm6
+
+ /* add earlier value from mem */
+ mov eax, [ebp + Vnb]
+ addss xmm7, [eax + edx*4]
+ /* move back to mem */
+ movss [eax + edx*4], xmm7
+
+ /* finish if last */
+ mov ecx, [ebp + nri]
+ dec ecx
+ jecxz .i3330_end
+ /* not last, iterate once more! */
+ mov [ebp + nri], ecx
+ jmp .i3330_outer
+.i3330_end:
+ emms
+ mov eax, [esp + salign]
+ add esp, eax
+ add esp, 1508
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ leave
+ ret
+
+++ /dev/null
-;;
-;; This source code is part of
-;;
-;; G R O M A C S
-;;
-;; GROningen MAchine for Chemical Simulations
-;;
-;; VERSION 3.0
-;;
-;; Copyright (c) 1991-2001
-;; BIOSON Research Institute, Dept. of Biophysical Chemistry
-;; University of Groningen, The Netherlands
-;;
-;; This program is free software; you can redistribute it and/or
-;; modify it under the terms of the GNU General Public License
-;; as published by the Free Software Foundation; either version 2
-;; of the License, or (at your option) any later version.
-;;
-;; If you want to redistribute modifications, please consider that
-;; scientific software is very special. Version control is crucial -
-;; bugs must be traceable. We will be happy to consider code for
-;; inclusion in the official distribution, but derived work must not
-;; be called official GROMACS. Details are found in the README & COPYING
-;; files - if they are missing, get the official version at www.gromacs.org.
-;;
-;; To help us fund GROMACS development, we humbly ask that you cite
-;; the papers on the package - you can find them in the top README file.
-;;
-;; Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
-;;
-;; And Hey:
-;; GROup of MAchos and Cynical Suckers
-
-; NASM macro set to make interfacing to 32-bit programs easier -*- nasm -*-
-;; proc NAME
-;; Opens a global procedure called NAME: exports the symbol, emits the label
-;; followed by the "push ebp / mov ebp,esp" frame setup, and pushes a
-;; preprocessor context named "proc". Inside that context %$arg holds the
-;; ebp-relative offset that the next "arg" line will hand out; it starts at 8,
-;; i.e. just above the saved ebp and the return address. %$procname remembers
-;; NAME so that endproc can build its __end_NAME label.
-%imacro proc 1 ; begin a procedure definition
-%push proc
- global %1
-%1: push ebp
- mov ebp,esp
-%assign %$arg 8
-%define %$procname %1
-%endmacro
-
-
-
-;; LABEL arg [SIZE]
-;; Declares one stack argument of the procedure currently being defined.
-;; LABEL (the label written in front of the macro call, available as %00) is
-;; equated to the current argument offset %$arg, and %$arg is then advanced
-;; by SIZE bytes. SIZE defaults to 4 when it is omitted.
-%imacro arg 0-1 4 ; used with the argument name as a label
-%00 equ %$arg
-%assign %$arg %1+%$arg
-%endmacro
-
-
-
-;; endproc
-;; Closes the procedure opened by the matching "proc": emits "leave / ret",
-;; defines the label __end_NAME right after the return, and pops the "proc"
-;; preprocessor context. If the innermost context is not "proc" an assembly
-;; error is raised instead and nothing is emitted.
-%imacro endproc 0
-%ifnctx proc
-%error Mismatched `endproc'/`proc'
-
-%else
- leave
- ret
-__end_%$procname: ; useful for calculating function size
-
-%pop
-%endif
-%endmacro
-
-
-segment .data
-
-;; Single-precision constants, each stored as four identical dwords so that
-;; one 16-byte load fills every lane of an xmm register.
-;; No alignment directive precedes them; the routines visible in this file
-;; fetch them with movups (unaligned load) and, where needed, copy them to
-;; 16-byte aligned stack slots.
-;; -0.5 in all four lanes
-sse_minushalf
- dd -0.5
- dd -0.5
- dd -0.5
- dd -0.5
-;; 0.5 in all four lanes
-sse_half
- dd 0.5
- dd 0.5
- dd 0.5
- dd 0.5
-;; 2.0 in all four lanes
-sse_two
- dd 2.0
- dd 2.0
- dd 2.0
- dd 2.0
-;; 3.0 in all four lanes
-sse_three
- dd 3.0
- dd 3.0
- dd 3.0
- dd 3.0
-;; 6.0 in all four lanes
-sse_six
- dd 6.0
- dd 6.0
- dd 6.0
- dd 6.0
-;; 12.0 in all four lanes
-sse_twelve
- dd 12.0
- dd 12.0
- dd 12.0
- dd 12.0
-
-
-segment .text
-
-;; checksse
-;; Takes no arguments and touches no general-purpose register: it simply
-;; executes one SSE instruction (xorps) bracketed by emms and returns.
-;; NOTE(review): presumably the caller installs an illegal-instruction
-;; handler and uses a clean return as proof of SSE support - confirm at the
-;; call site.
- global checksse ; tries to issue a simple SSE instruction
-checksse:
- emms
- xorps xmm0,xmm0 ; the probe: any SSE instruction would do
- emms
- ret
-
-align 16
-;; vecinvsqrt_sse(in, out, n)
-;; Stack arguments (read at [ebp+8], [ebp+12], [ebp+16]):
-;;   in  - pointer to n single-precision inputs x[i]
-;;   out - pointer to n single-precision outputs
-;;   n   - element count
-;; Stores out[i] = 1/sqrt(x[i]) using the rsqrtps/rsqrtss table lookup "lu"
-;; refined by one Newton-Raphson step:
-;;   0.5 * lu * (3 - x*lu*lu)
-;; The code forms (x*lu*lu - 3) because that needs no extra register copy,
-;; so the final scale factor must be -0.5, not +0.5; with +0.5 every result
-;; comes out with the wrong sign.
-;; Elements are handled in groups of 8, then 4, then 2, then 1. The 8- and
-;; 4-element paths use movaps, so in and out must be 16-byte aligned
-;; whenever n >= 4.
-;; eax, ebx, ecx and edx are saved and restored; xmm0-xmm3, xmm6 and xmm7
-;; are overwritten. Nothing is returned.
- global vecinvsqrt_sse
-vecinvsqrt_sse
- push ebp
- mov ebp,esp
- push eax
- push ebx
- push ecx
- push edx
-
- mov eax, [ebp + 8] ; eax = in
- mov ebx, [ebp + 12] ; ebx = out
- mov ecx, [ebp + 16] ; ecx = n
- mov edx, ecx ; keep n for the tail tests
- movups xmm6,[sse_three]
- movups xmm7,[sse_minushalf] ; -0.5: compensates for (x*lu*lu - 3) below
- shr ecx, 3 ; number of 8-element groups
- jecxz .iter4
- emms
-.loop8:
- movaps xmm0,[eax] ; x, elements 0-3
- add eax, byte 16
- rsqrtps xmm1,xmm0 ; lu
- movaps xmm2,[eax] ; x, elements 4-7
- add eax, byte 16
- rsqrtps xmm3,xmm2 ; lu
- mulps xmm0,xmm1
- mulps xmm2,xmm3
- mulps xmm0,xmm1 ; x*lu*lu
- mulps xmm2,xmm3
- subps xmm0,xmm6 ; x*lu*lu - 3
- subps xmm2,xmm6
- mulps xmm0,xmm1 ; lu*(x*lu*lu - 3)
- mulps xmm2,xmm3
- mulps xmm0,xmm7 ; -0.5*lu*(x*lu*lu - 3) = 1/sqrt(x)
- mulps xmm2,xmm7
- movaps [ebx],xmm0
- add ebx, byte 16
- movaps [ebx],xmm2
- add ebx, byte 16
- dec ecx
- jecxz .iter4
- jmp .loop8
-.iter4:
- ; one group of four left?
- mov ecx,edx
- and ecx,4
- jecxz .iter2
- movaps xmm0,[eax]
- add eax, byte 16
- rsqrtps xmm1,xmm0
- mulps xmm0,xmm1
- mulps xmm0,xmm1
- subps xmm0,xmm6
- mulps xmm0,xmm1
- mulps xmm0,xmm7
- movaps [ebx],xmm0
- add ebx, byte 16
-.iter2:
- ; one pair left? Only the low two lanes are loaded and stored.
- mov ecx,edx
- and ecx,2
- jecxz .iter1
- movlps xmm0,[eax]
- add eax, byte 8
- rsqrtps xmm1,xmm0
- mulps xmm0,xmm1
- mulps xmm0,xmm1
- subps xmm0,xmm6
- mulps xmm0,xmm1
- mulps xmm0,xmm7
- movlps [ebx],xmm0
- add ebx, byte 8
-.iter1:
- ; one single element left? Scalar version of the same refinement.
- mov ecx,edx
- and ecx,1
- jecxz .end
- movss xmm0,[eax]
- rsqrtss xmm1,xmm0
- mulss xmm0,xmm1
- mulss xmm0,xmm1
- subss xmm0,xmm6
- mulss xmm0,xmm1
- mulss xmm0,xmm7
- movss [ebx],xmm0
-.end:
- emms
- pop edx
- pop ecx
- pop ebx
- pop eax
- leave
- ret
-
- global vecrecip_sse
-;; vecrecip_sse(in, out, n)
-;; Stack arguments: in at [ebp+8], out at [ebp+12], n at [ebp+16].
-;; Stores out[i] = 1/in[i] in single precision. Each result is the
-;; rcpps/rcpss estimate "lu" improved by one Newton-Raphson step,
-;;   lu * (2 - x*lu).
-;; Work is split into blocks of 8, then 4, 2 and 1 elements; the 8- and
-;; 4-element blocks use aligned 16-byte loads and stores.
-;; eax-edx are preserved, xmm0-xmm6 are overwritten, nothing is returned.
-vecrecip_sse
- push ebp
- mov ebp, esp
- push eax
- push ebx
- push ecx
- push edx
-
- mov eax, [ebp + 8] ; eax walks the input
- mov ebx, [ebp + 12] ; ebx walks the output
- mov edx, [ebp + 16] ; edx = n, kept for the tail tests
- movups xmm6, [sse_two]
- mov ecx, edx
- shr ecx, 3 ; ecx = number of 8-element blocks
- jz .tail4
- emms
-.body8:
- ; both quads are loaded before anything is stored
- movaps xmm0, [eax]
- movaps xmm3, [eax + 16]
- add eax, byte 32
- ; first quad -> xmm2
- rcpps xmm1, xmm0
- movaps xmm2, xmm6
- mulps xmm0, xmm1
- subps xmm2, xmm0
- mulps xmm2, xmm1
- ; second quad -> xmm5
- rcpps xmm4, xmm3
- movaps xmm5, xmm6
- mulps xmm3, xmm4
- subps xmm5, xmm3
- mulps xmm5, xmm4
- movaps [ebx], xmm2
- movaps [ebx + 16], xmm5
- add ebx, byte 32
- dec ecx
- jnz .body8
-.tail4:
- test edx, 4 ; a full quad remaining?
- jz .tail2
- movaps xmm0, [eax]
- add eax, byte 16
- rcpps xmm1, xmm0
- movaps xmm2, xmm6
- mulps xmm0, xmm1
- subps xmm2, xmm0
- mulps xmm2, xmm1
- movaps [ebx], xmm2
- add ebx, byte 16
-.tail2:
- test edx, 2 ; a pair remaining? (low two lanes only)
- jz .tail1
- movlps xmm0, [eax]
- add eax, byte 8
- rcpps xmm1, xmm0
- movaps xmm2, xmm6
- mulps xmm0, xmm1
- subps xmm2, xmm0
- mulps xmm2, xmm1
- movlps [ebx], xmm2
- add ebx, byte 8
-.tail1:
- test edx, 1 ; one last element? (scalar)
- jz .done
- movss xmm0, [eax]
- rcpss xmm1, xmm0
- movss xmm2, xmm6
- mulss xmm0, xmm1
- subss xmm2, xmm0
- mulss xmm2, xmm1
- movss [ebx], xmm2
-.done:
- emms
- pop edx
- pop ecx
- pop ebx
- pop eax
- leave
- ret
-
-
-;; inl0100_sse
-;; Lennard-Jones-only inner loop. For every entry n of the neighbour list
-;; (nri entries) it takes atom ii = iinr[n], adds shiftvec[3*shift[n]] to its
-;; position, and loops over the j atoms jjnr[jindex[n] .. jindex[n+1]-1].
-;; Per pair it evaluates, from the squared distance rsq:
-;;   rinvsq = 1/rsq            (rcpps estimate + one Newton-Raphson step)
-;;   vnb6   = c6  * rinvsq^3
-;;   vnb12  = c12 * rinvsq^6
-;;   vnb    = vnb12 - vnb6
-;;   fscal  = (12*vnb12 - 6*vnb6) * rinvsq
-;; c6/c12 are read from nbfp at index 2*type[j] + 2*ntype*type[ii].
-;; dr*fscal is added to the i force and subtracted from the j force in
-;; faction[]; the summed i force is also added to fshift[3*shift[n]], and the
-;; summed vnb to Vnb[gid[n]].
-;; The j loop is unrolled by four, followed by an optional pair step and an
-;; optional single step for the remainder.
-;; The argument slots of shift, iinr, jindex, gid and nri are modified in
-;; place while iterating. eax-edx, esi and edi are saved and restored.
-;; The loop body runs before nri is tested, so nri must be at least 1.
-proc inl0100_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
- ;; (offsets 0-256 are 16-byte slots used with movaps, 272-292 are dwords;
- ;; .half and .three are reserved here but not referenced in this routine)
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.dx equ 48
-.dy equ 64
-.dz equ 80
-.two equ 96
-.c6 equ 112
-.c12 equ 128
-.six equ 144
-.twelve equ 160
-.vnbtot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.ntia equ 280
-.innerjjnr equ 284
-.innerk equ 288
-.salign equ 292
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 296 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax ; round esp down to a 16-byte boundary
- mov [esp + .salign], eax ; remember the adjustment so .end can undo it
-
- emms
-
- movups xmm1, [sse_two]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movaps [esp + .two], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4] ; shift vector x, y, z
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4] ; edx = type[ii]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4] ; shifted i position
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- shufps xmm0, xmm0, 0b ; broadcast element 0 to all four lanes
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vnbtot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms, minus 4
- jge .unroll_loop ; at least four j atoms: enter the unrolled loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4] ; eax-edx = type[jnr1-4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi ; eax-edx = 2*type[j] + ntia, index into nbfp
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4] ; two consecutive floats (c6,c12) per j atom
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b ; even elements -> c6 for j1-j4
- shufps xmm6, xmm7, 11011101b ; odd elements -> c12 for j1-j4
-
- movd eax, mm0 ; restore jnr1-4
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- ; NOTE(review): xmm3 is not read again before it is reloaded from .fix
- ; further down, so this multiply appears to have no effect.
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4] ; x,y of j1 (low) / j2 (high)
- movlps xmm5, [esi + ecx*4] ; x,y of j3 (low) / j4 (high)
- movss xmm2, [esi + eax*4 + 8] ; z of j1
- movss xmm6, [esi + ecx*4 + 8] ; z of j3
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8] ; z of j2
- movss xmm1, [esi + edx*4 + 8] ; z of j4
-
- shufps xmm2, xmm0, 0b ; z1 z1 z2 z2
- shufps xmm6, xmm1, 0b ; z3 z3 z4 z4
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b ; xmm2 = jz1-4
-
- shufps xmm0, xmm5, 10001000b ; xmm0 = jx1-4
- shufps xmm1, xmm5, 11011101b ; xmm1 = jy1-4
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- ; one Newton-Raphson step: rinvsq = lu*(2 - rsq*lu)
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6] ; vnb6
- mulps xmm2, [esp + .c12] ; vnb12
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1 ; 12*vnb12 - 6*vnb6
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx and fjy (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b ; bring tz of j2 to element 0
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b ; bring tz of j3 to element 0
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b ; bring tz of j4 to element 0
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4 ; undo the bias: innerk = 0..3 atoms left
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx] ; eax, ebx = jnr1, jnr2
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b ; c6 of j1, j2 in elements 0, 1
- shufps xmm6, xmm6, 1101b ; c12 of j1, j2 in elements 0, 1
- movlhps xmm4, xmm7 ; zero the two unused upper lanes
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- ; NOTE(review): xmm3 is not read again before it is reloaded from .fix
- ; further down, so this move appears to have no effect.
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b ; z1 z2 z1 z2
-
- shufps xmm0, xmm0, 10001000b ; x1 x2 x1 x2
- shufps xmm1, xmm1, 11011101b ; y1 y2 y1 y2
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b ; swap elements 0 and 1: j2's force to element 0
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b ; odd remainder: one more j atom to do
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b ; c6 in element 0, zeros elsewhere
- shufps xmm6, xmm6, 11111101b ; c12 in element 0, zeros elsewhere
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in the two low elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign] ; undo the alignment adjustment, then the locals
- add esp, eax
- add esp, 296
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-;; inl0110_sse
-;; Lennard-Jones-only inner loop (no coulomb term is evaluated anywhere in
-;; this routine), four j particles at a time in SSE registers.
-;; For every outer iteration the routine:
-;;   1. reads the shift index and shift vector, the first i atom index and
-;;      three consecutive ints from nsatoms[],
-;;   2. runs the .mno_vdwc loop nsatoms[1] times, skips
-;;      (nsatoms[2]-nsatoms[1]) atoms, then runs the .mno_vdw loop
-;;      (nsatoms[0]-nsatoms[2]) times; both loops execute the same
-;;      instruction sequence here,
-;;   3. for each i atom walks the same jjnr[] range in groups of 4, then an
-;;      optional pair, then an optional single j particle,
-;;   4. adds the accumulated i force to faction[] and fshift[], and the
-;;      accumulated energy to Vnb[gid].
-;; Per interaction: rinvsq = 1/rsq (rcpps + one Newton-Raphson step),
-;; vnb = c12*rinvsq^6 - c6*rinvsq^3, fscal = (12*vnb12 - 6*vnb6)*rinvsq.
-;; Arguments are addressed as [ebp + %$name]; the pointer arguments shift,
-;; iinr, nsatoms, jindex and gid are advanced in place, and nri is counted
-;; down to zero. eax, ebx, ecx, edx, esi, edi are saved and restored.
-;; NOTE(review): proc/arg/endproc are macros defined outside this block;
-;; their exact frame setup is assumed, not shown here.
-proc inl0110_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
- ;; offsets 0..256 are 16-byte slots accessed with movaps,
- ;; offsets 272 and up are 4-byte scalars.
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.dx equ 48
-.dy equ 64
-.dz equ 80
-.two equ 96
-.c6 equ 112
-.c12 equ 128
-.six equ 144
-.twelve equ 160
-.vnbtot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
- ;; NOTE(review): .half and .three are never written or read in this routine.
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.shX equ 280
-.shY equ 284
-.shZ equ 288
-.ntia equ 292
-.innerjjnr0 equ 296
-.innerjjnr equ 300
-.innerk0 equ 304
-.innerk equ 308
-.salign equ 312
-.nsvdwc equ 316
-.nscoul equ 320
-.nsvdw equ 324
-.solnr equ 328
-
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 332 ; local stack space
- ;; round esp down to a multiple of 16 and remember the adjustment
- ;; in .salign so that it can be undone at .end
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- ;; copy the constants 2, 6 and 12 to aligned stack slots
- movups xmm1, [sse_two]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movaps [esp + .two], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movlps xmm0, [eax + ebx*4] ; getting the shiftvector
- movss xmm1, [eax + ebx*4 + 8]
- movlps [esp + .shX], xmm0 ; writes .shX and .shY (8 bytes)
- movss [esp + .shZ], xmm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- ;; read three ints from nsatoms[] and turn them into loop counts:
- ;; nsvdwc = n[1], nscoul = n[2]-n[1], nsvdw = n[0]-n[2]
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ; clear vnbtot
- xorps xmm4, xmm4
- movaps [esp + .vnbtot], xmm4
- mov [esp + .solnr], ebx ; solnr = index of the current i atom
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- ;; skip the first atom loop entirely when nsvdwc is zero
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testvdw
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- ;; ntia = 2*ntype*type[ii], offset of this i type's row in nbfp[]
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- ;; shifted i coordinate = shift vector + pos[ii3..ii3+2]
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- ;; broadcast element 0 to all four elements
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- ;; restart the j list for this i atom; innerk = count-4 so that a
- ;; non-negative value means at least one full group of four remains
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ; quad-unroll innerloop here
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- ;; nbfp index for each j = 2*type[jnr] + ntia
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- ;; load four (c6,c12) pairs and transpose them into one c6 vector
- ;; (even elements -> xmm4) and one c12 vector (odd elements -> xmm6)
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- ;; restore the four j indices saved in mm0-mm3
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- ;; NOTE(review): xmm3 is not read again before it is reloaded from
- ;; .fix below, so this multiply appears to have no effect on the
- ;; results - confirm before removing.
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- ; one Newton-Raphson step: rinvsq = lu*(2 - rsq*lu)
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- ;; each shufps moves the next element of tz into element 0 so that
- ;; subss can apply it to one j particle at a time
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- ;; innerk is restored to the 0..3 leftover count; bit 1 selects the
- ;; pair step and bit 0 the single step
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- ;; c6 and c12 for the two j particles go to elements 0-1; the upper
- ;; two elements are zeroed via movlhps with xmm7 = 0
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- ;; NOTE(review): xmm3 is not read before it is reloaded from .fix
- ;; below, so this move appears to have no effect - confirm.
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- ; first j particle: element 0 of tx-tz
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- ; swap elements 0 and 1 so the second j particle's force is in element 0
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- ;; c6 -> element 0 of xmm4, c12 -> element 0 of xmm6; the remaining
- ;; elements are filled from the zeroed upper half of xmm6
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in the two low elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testvdw
- jmp .mno_vdwc
- ;; NOTE(review): the label below has no trailing colon, unlike every
- ;; other label in this routine.
-.testvdw
- ;; skip the nscoul atoms: no interaction is computed for them here
- mov ebx, [esp + .nscoul]
- add [esp + .solnr], dword ebx
-
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- ;; second atom loop: same instruction sequence as .mno_vdwc above,
- ;; counted by .nsvdw
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- ;; NOTE(review): as in the first loop, xmm3 is overwritten from .fix
- ;; before it is read, so this multiply appears to be a no-op - confirm.
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- ;; NOTE(review): xmm3 is reloaded from .fix before being read, so
- ;; this move appears to have no effect - confirm.
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in the two low elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- ;; undo the alignment adjustment and release the 332-byte frame
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 332
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testvdw
- jmp .mno_vdwc
-.testvdw
- mov ebx, [esp + .nscoul]
- add [esp + .solnr], dword ebx
-
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-
-.last_mno:
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 332
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl0300_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.dx equ 48
-.dy equ 64
-.dz equ 80
-.two equ 96
-.tabscale equ 112
-.c6 equ 128
-.c12 equ 144
-.fs equ 160
-.vnbtot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.ntia equ 280
-.innerjjnr equ 284
-.innerk equ 288
-.salign equ 292
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 296 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear tot potential and i forces
- xorps xmm4, xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 3
- pslld mm7, 3
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 0]
- movlps xmm7, [esi + ecx*4 + 0]
- movhps xmm5, [esi + ebx*4 + 0]
- movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 0]
- movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 0]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 296
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl0310_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.dx equ 48
-.dy equ 64
-.dz equ 80
-.two equ 96
-.tabscale equ 112
-.c6 equ 128
-.c12 equ 144
-.fs equ 160
-.vnbtot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.shX equ 280
-.shY equ 284
-.shZ equ 288
-.ntia equ 292
-.innerjjnr0 equ 296
-.innerjjnr equ 300
-.innerk0 equ 304
-.innerk equ 308
-.salign equ 312
-.nsvdwc equ 316
-.nscoul equ 320
-.nsvdw equ 324
-.solnr equ 328
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 332 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movlps xmm0, [eax + ebx*4] ; getting the shiftvector
- movss xmm1, [eax + ebx*4 + 8]
- movlps [esp + .shX], xmm0
- movss [esp + .shZ], xmm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ; clear vnbtot
- xorps xmm4, xmm4
- movaps [esp + .vnbtot], xmm4
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
-
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testvdw
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 3
- pslld mm7, 3
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 0]
- movlps xmm7, [esi + ecx*4 + 0]
- movhps xmm5, [esi + ebx*4 + 0]
- movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 0]
- movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 0]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testvdw
- jmp .mno_vdwc
-.testvdw
- mov ebx, [esp + .nscoul]
- add [esp + .solnr], dword ebx
-
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 3
- pslld mm7, 3
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 0]
- movlps xmm7, [esi + ecx*4 + 0]
- movhps xmm5, [esi + ebx*4 + 0]
- movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 0]
- movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 3
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 0]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-.last_mno:
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 332
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl1000_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.vctot equ 112
-.fix equ 128
-.fiy equ 144
-.fiz equ 160
-.half equ 176
-.three equ 192
-.is3 equ 208
-.ii3 equ 212
-.innerjjnr equ 216
-.innerk equ 220
-.salign equ 224
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 228 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unrolled innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm5, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm5
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4] ;x1 y1 - -
- movlps xmm5, [esi + ecx*4] ;x3 y3 - -
- movss xmm2, [esi + eax*4 + 8] ;z1 - - -
- movss xmm6, [esi + ecx*4 + 8] ;z3 - - -
-
- movhps xmm4, [esi + ebx*4] ;x1 y1 x2 y2
- movhps xmm5, [esi + edx*4] ;x3 y3 x4 y4
-
- movss xmm0, [esi + ebx*4 + 8] ;z2 - - -
- movss xmm1, [esi + edx*4 + 8] ;z4 - - -
-
- shufps xmm2, xmm0, 0b ;z1 z1 z2 z2
- shufps xmm6, xmm1, 0b ;z3 z3 z4 z4
-
- movaps xmm0, xmm4 ;x1 y1 x2 y2
- movaps xmm1, xmm4 ;x1 y1 x2 y2
-
- shufps xmm2, xmm6, 10001000b ;z1 z2 z3 z4
-
- shufps xmm0, xmm5, 10001000b ;x1 x2 x3 x4
- shufps xmm1, xmm5, 11011101b ;y1 y2 y3 y4
-
- mov edi, [ebp + %$faction]
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
- xorps xmm7,xmm7
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- mov edi, [ebp + %$faction]
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec dword ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 228
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
- ;; ---------------------------------------------------------------------
- ;; inl1010_sse - single-precision SSE inner loop, Coulomb interaction only.
- ;;
- ;; For each of the nri outer entries the routine reads a start atom index
- ;; from iinr[] and an atom count from nsatoms[] and then, for every atom
- ;; in that run of consecutive atoms, walks the same j-neighbour list
- ;; (jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]) four, two and one j atoms at
- ;; a time. For every i-j pair it computes
- ;;     vcoul = facel*charge[i]*charge[j]*rinv,   fscal = vcoul*rinv*rinv
- ;; adds vcoul to the running potential, adds fscal*dr to the i force and
- ;; subtracts it from faction[] of the j atom.
- ;;
- ;; Arguments (read relative to ebp; the frame is presumably set up by the
- ;; proc/arg macros, which are not shown here - TODO confirm):
- ;;   nri      - number of outer iterations; must be >= 1 (see .outer)
- ;;   iinr     - int*, one start i-atom index consumed per outer iteration
- ;;   jindex   - int*, jindex[n]/jindex[n+1] delimit the j list in jjnr
- ;;   jjnr     - int*, j atom indices
- ;;   shift    - int*, one shift index consumed per outer iteration
- ;;   shiftvec - float*, three floats per shift index, added to i position
- ;;   fshift   - float*, three floats per shift index, receives i force
- ;;   gid      - int*, one index into Vc consumed per outer iteration
- ;;   pos      - float*, three floats (x,y,z) per atom
- ;;   faction  - float*, three floats per atom, updated in place
- ;;   charge   - float*, one float per atom
- ;;   facel    - float scalar, multiplied into the i charge
- ;;   Vc       - float*, Vc[gid] is incremented with the summed potential
- ;;   nsatoms  - int*, three ints per outer iteration; only the third is read
- ;; The iinr/jindex/shift/gid/nsatoms pointers and nri are advanced or
- ;; decremented directly in their argument slots.
- ;; All general-purpose registers that are used are saved and restored.
- ;; ---------------------------------------------------------------------
-proc inl1010_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; esp is rounded down to a 16-byte boundary below, so the 16-byte
- ;; slots (.ix .. .three) can be accessed with movaps.
- ;; .ntia is defined but not referenced in this routine.
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.vctot equ 112
-.fix equ 128
-.fiy equ 144
-.fiz equ 160
-.half equ 176
-.three equ 192
-.is3 equ 208
-.ii3 equ 212
-.shX equ 216
-.shY equ 220
-.shZ equ 224
-.ntia equ 228
-.innerjjnr0 equ 232
-.innerk0 equ 236
-.innerjjnr equ 240
-.innerk equ 244
-.salign equ 248
-.nscoul equ 252
-.solnr equ 256
-
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 260 ; local stack space
- ;; align esp down to 16 bytes and remember the adjustment in .salign
- ;; so that the epilogue can undo it.
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- ;; copy the sse_half/sse_three constants (defined elsewhere; used below
- ;; as the 0.5 and 3.0 of the Newton-Raphson step) to aligned stack slots
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- ;; skip two ints so that each read below fetches the third int of a triple
- add [ebp + %$nsatoms], dword 8
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- ;; keep the shift vector on the stack; it is re-used for every i atom
- ;; handled in the .mno_coul loop
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
- movss [esp + .shX], xmm0
- movss [esp + .shY], xmm1
- movss [esp + .shZ], xmm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- ;; number of i atoms to process for this entry -> .nscoul
- mov eax, [ebp + %$nsatoms]
- mov ecx, [eax]
- add [ebp + %$nsatoms], dword 12 ; advance by one triple of ints
- mov [esp + .nscoul], ecx
-
- ; clear vctot
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- mov [esp + .solnr], ebx ; first i atom index of this entry
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- ;; nothing to do for this entry if the atom count is zero
- mov ecx, [esp + .nscoul]
- cmp ecx, dword 0
- jnz .mno_coul
- jmp .last_mno
-.mno_coul:
- ;; one pass per i atom: take the current atom index and post-increment it
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- ;; iq = charge[ii]*facel, broadcast to all four lanes
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- ;; shifted i position = shift vector + pos[ii3..ii3+2]
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- ;; broadcast ix/iy/iz to all four lanes
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- ;; restart the j list from its beginning for this i atom
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- ;; mov leaves the flags alone, so jge tests the result of the sub above:
- ;; run the unrolled loop only if at least four j atoms are available
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-
-.unroll_coul_loop:
- ;; quad-unrolled innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- ;; gather the four j charges: xmm3=q1, xmm6=q2, xmm4=q3, xmm7=q4
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm5, [esp + .iq]
- shufps xmm3, xmm6, 00000000b ; xmm3 = q1 q1 q2 q2
- shufps xmm4, xmm7, 00000000b ; xmm4 = q3 q3 q4 q4
- shufps xmm3, xmm4, 10001000b ; xmm3 = q1 q2 q3 q4
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm5 ; xmm3 = iq*qj for the four j atoms
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- ;; xmm4 = x1 y1 x2 y2, xmm5 = x3 y3 x4 y4 once the movhps below are done
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b ; xmm2 = z1 z1 z2 z2
- shufps xmm6, xmm1, 0b ; xmm6 = z3 z3 z4 z4
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b ; xmm2 = z1 z2 z3 z4
-
- shufps xmm0, xmm5, 10001000b ; xmm0 = x1 x2 x3 x4
- shufps xmm1, xmm5, 11011101b ; xmm1 = y1 y2 y3 y4
-
- mov edi, [ebp + %$faction]
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ;; rinv = 0.5*lu*(3.0 - rsq*lu*lu): one Newton-Raphson refinement of the
- ;; rsqrtps estimate lu
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3 ; vctot += vcoul (per lane)
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3 contains fjx and xmm4 fjy for the four j atoms (z is done below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- ;; each shufps below moves the next lane of tz into lane 0 for subss
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b ; lane 0 = tz of j2
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b ; lane 0 = tz of j3
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b ; lane 0 = tz of j4
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; check if at least two particles remain
- ;; undo the -4 bias: .innerk is now the number of leftover j atoms (0-3)
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_coul
- jmp .checksingle_coul
-.dopair_coul:
- ;; two leftover j atoms, processed in lanes 0 and 1
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
- xorps xmm7,xmm7
- movlhps xmm3, xmm7 ; zero iq*qj in lanes 2-3 so they add nothing to vctot or the forces
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- ;; lanes 2-3 of the coordinates just repeat the two real j atoms
- shufps xmm2, xmm2, 10001000b ; xmm2 = z1 z2 z1 z2
-
- shufps xmm0, xmm0, 10001000b ; xmm0 = x1 x2 x1 x2
- shufps xmm1, xmm1, 11011101b ; xmm1 = y1 y2 y1 y2
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ;; same rsqrtps + Newton-Raphson step as in the unrolled loop
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- ;; NOTE(review): subps is used here although only lane 0 is stored back
- ;; with movss; the stored value is the same as with subss.
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- ;; swap lanes 0 and 1 so the force on the second j atom is in lane 0
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:
- ;; bit 0 of the leftover count: one last j atom to do?
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_coul
- jmp .updateouterdata_coul
-.dosingle_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- ;; movss from memory clears lanes 1-3, so iq*qj is zero there and those
- ;; lanes add nothing to vctot or the forces (assuming rinv stays finite
- ;; in the unused lanes)
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ;; same rsqrtps + Newton-Raphson step as in the unrolled loop
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- mov edi, [ebp + %$faction]
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_coul:
- ;; reduce the four-lane i force to one value per component and add it to
- ;; faction[ii3..] and fshift[is3..]
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums are now in the two low lanes of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- ;; next i atom of this entry, if any remain
- dec dword [esp + .nscoul]
- jz .last_mno
- jmp .mno_coul
-
-.last_mno:
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- ;; undo the alignment adjustment and the local frame, then restore the
- ;; registers in reverse push order
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 260
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl1020_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.vctot equ 352
-.fixO equ 368
-.fiyO equ 384
-.fizO equ 400
-.fixH1 equ 416
-.fiyH1 equ 432
-.fizH1 equ 448
-.fixH2 equ 464
-.fiyH2 equ 480
-.fizH2 equ 496
-.fjx equ 512
-.fjy equ 528
-.fjz equ 544
-.half equ 560
-.three equ 576
-.is3 equ 592
-.ii3 equ 596
-.innerjjnr equ 600
-.innerk equ 604
-.salign equ 608
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 612 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- ; start with rsqO - seed in xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm7, xmm4 ; rinvO in xmm7
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm6, xmm4 ; rinvH1 in xmm6
- ; rsqH2 - seed in xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm5, xmm4 ; rinvH2 in xmm5
-
- ; do O interactions
- movaps xmm4, xmm7
- mulps xmm4, xmm4 ; xmm7=rinv, xmm4=rinvsq
- mulps xmm7, [esp + .qqO] ;xmm7=vcoul
-
- mulps xmm4, xmm7 ; total fsO in xmm4
-
- addps xmm7, [esp + .vctot]
-
- movaps [esp + .vctot], xmm7
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H1 interactions
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm6=rinv, xmm4=rinvsq
- mulps xmm6, [esp + .qqH] ;xmm6=vcoul
- mulps xmm4, xmm6 ; total fsH1 in xmm4
-
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- movaps [esp + .vctot], xmm6
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H2 interactions
- movaps xmm4, xmm5
- mulps xmm4, xmm4 ; xmm5=rinv, xmm4=rinvsq
- mulps xmm5, [esp + .qqH] ;xmm5=vcoul
- mulps xmm4, xmm5 ; total fsH1 in xmm4
-
- addps xmm5, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- movaps [esp + .vctot], xmm5
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm3, [esp + .qqO]
-
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- movaps [esp + .vctot], xmm3
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums are now in the low two elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums are now in the low two elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 612
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl1030_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
- ;; stack offsets for local variables
- ;; bottom of stack is 16-byte aligned for sse use (movaps needs it)
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.vctot equ 768
-.fixO equ 784
-.fiyO equ 800
-.fizO equ 816
-.fixH1 equ 832
-.fiyH1 equ 848
-.fizH1 equ 864
-.fixH2 equ 880
-.fiyH2 equ 896
-.fizH2 equ 912
-.fjxO equ 928
-.fjyO equ 944
-.fjzO equ 960
-.fjxH1 equ 976
-.fjyH1 equ 992
-.fjzH1 equ 1008
-.fjxH2 equ 1024
-.fjyH2 equ 1040
-.fjzH2 equ 1056
-.half equ 1072
-.three equ 1088
-.rsqOO equ 1104
-.rsqOH1 equ 1120
-.rsqOH2 equ 1136
-.rsqH1O equ 1152
-.rsqH1H1 equ 1168
-.rsqH1H2 equ 1184
-.rsqH2O equ 1200
-.rsqH2H1 equ 1216
-.rsqH2H2 equ 1232
-.rinvOO equ 1248
-.rinvOH1 equ 1264
-.rinvOH2 equ 1280
-.rinvH1O equ 1296
-.rinvH1H1 equ 1312
-.rinvH1H2 equ 1328
-.rinvH2O equ 1344
-.rinvH2H1 equ 1360
-.rinvH2H2 equ 1376
-.is3 equ 1392
-.ii3 equ 1396
-.innerjjnr equ 1400
-.innerk equ 1404
-.salign equ 1408
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1412 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; innerloop atom count minus 4 (unrolled-loop counter)
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm7, xmm0
- mulps xmm0, xmm0
- mulps xmm7, [esp + .qqOO]
- mulps xmm0, xmm7
- addps xmm7, [esp + .vctot]
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsOH1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsOH2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsH1O
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH1H1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH1H2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsH2O
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH2H1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH2H2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps [esp + .vctot], xmm7
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- xorps xmm1, xmm1
- movaps xmm0, xmm3
- xorps xmm4, xmm4
- mulps xmm0, xmm0 ; xmm0=rinvsq
- ;; fetch charges to xmm4 (temporary)
- movss xmm4, [esp + .qqOO]
-
- movhps xmm4, [esp + .qqOH]
-
- mulps xmm3, xmm4 ; xmm3=vcoul
- mulps xmm0, xmm3 ; total fscal
- addps xmm3, [esp + .vctot]
- movaps [esp + .vctot], xmm3
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- ;; assemble charges in xmm6
- xorps xmm6, xmm6
- ; do coulomb interaction
- movaps xmm0, xmm3
- movss xmm6, [esp + .qqOH]
- movaps xmm4, xmm7
- movhps xmm6, [esp + .qqHH]
- mulps xmm0, xmm0 ; rinvsq
- mulps xmm4, xmm4 ; rinvsq
- mulps xmm3, xmm6 ; vcoul
- mulps xmm7, xmm6 ; vcoul
- movaps xmm2, xmm3
- addps xmm2, xmm7 ; total vcoul
- mulps xmm0, xmm3 ; fscal
-
- addps xmm2, [esp + .vctot]
- mulps xmm7, xmm4 ; fscal
- movaps [esp + .vctot], xmm2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do forces H2 - j water
- movaps xmm0, xmm7
- movaps xmm1, xmm7
- movaps xmm2, xmm7
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1412
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-
-
-proc inl1100_sse ; SSE inner loop: Coulomb (qq*rinv) + Lennard-Jones (c12*rinv^12 - c6*rinv^6) forces and energies for nri i-particles against their j-lists
-%$nri arg ; number of i particles (outer iterations); must be >= 1, counted down to zero
-%$iinr arg ; pointer into i-particle index list, advanced 4 bytes per outer iteration
-%$jindex arg ; pointer into j-list bounds: [n] and [n+1] delimit this i particle's range in jjnr
-%$jjnr arg ; base of the j-particle index list
-%$shift arg ; pointer into shift index list, advanced 4 bytes per outer iteration
-%$shiftvec arg ; base of shift vectors, 3 floats per shift index
-%$fshift arg ; base of shift forces, 3 floats per shift index (accumulated into)
-%$gid arg ; pointer into group index list, advanced 4 bytes per outer iteration
-%$pos arg ; base of positions, 3 floats per particle
-%$faction arg ; base of forces, 3 floats per particle (accumulated into)
-%$charge arg ; base of charges, 1 float per particle
-%$facel arg ; float passed by value; multiplied into the i charge
-%$Vc arg ; base of Coulomb energies, indexed by group index
-%$type arg ; base of particle type indices, 1 dword per particle
-%$ntype arg ; dword passed by value; multiplied with the i particle type
-%$nbfp arg ; base of LJ table; float pair (c6,c12) at index 2*(ntype*type_i + type_j)
-%$Vnb arg ; base of LJ energies, indexed by group index
- ;; stack offsets for local variables (16-byte xmm slots first, then dword scalars)
- ;; esp is rounded down to a 16-byte boundary below so the xmm slots can use movaps
-.ix equ 0 ; shifted i position x, broadcast to 4 lanes
-.iy equ 16
-.iz equ 32
-.iq equ 48 ; facel * charge[ii], broadcast to 4 lanes
-.dx equ 64 ; i - j distance vector for the current batch
-.dy equ 80
-.dz equ 96
-.c6 equ 112 ; LJ parameters for the current batch of j particles
-.c12 equ 128
-.six equ 144 ; copy of sse_six
-.twelve equ 160 ; copy of sse_twelve
-.vctot equ 176 ; per-lane Coulomb energy accumulator
-.vnbtot equ 192 ; per-lane LJ energy accumulator
-.fix equ 208 ; per-lane i force accumulators
-.fiy equ 224
-.fiz equ 240
-.half equ 256 ; copy of sse_half
-.three equ 272 ; copy of sse_three
-.is3 equ 288 ; 3 * shift index
-.ii3 equ 292 ; 3 * i particle index
-.ntia equ 296 ; 2 * ntype * type[ii]
-.innerjjnr equ 300 ; pointer to the next unprocessed entry of jjnr
-.innerk equ 304 ; inner loop counter (see .unroll_loop / .finish_inner)
-.salign equ 308 ; bytes subtracted from esp for alignment, undone in .end
- push eax ; save the registers this routine uses
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, dword 312 ; local stack space
- mov eax, esp
- and eax, 0xf ; eax = misalignment of esp relative to 16 bytes
- sub esp, eax ; esp is now 16-byte aligned
- mov [esp + .salign], eax ; remember the adjustment for the epilogue
-
- emms ; mm0-mm3 are used as scratch registers in the unrolled loop
-
- movups xmm0, [sse_half] ; constants defined outside this block; unaligned loads
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movaps [esp + .half], xmm0 ; keep aligned local copies
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4] ; xmm0-xmm2 = shift vector x, y, z
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4] ; charge[ii]
- mulss xmm3, [ebp + %$facel] ; iq = facel * charge[ii]
- shufps xmm3, xmm3, 0b ; broadcast iq to all four lanes
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4] ; type[ii]
- imul edx, [ebp + %$ntype]
- shl edx, 1 ; two floats (c6, c12) per table entry
- mov [esp + .ntia], edx ; ntia = 2*ntype*type[ii]
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4] ; i position + shift vector
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b ; broadcast the shifted i coordinates
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot, vnbtot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos] ; esi and edi are reloaded inside each inner section before use
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2 ; byte offset of jjnr[jindex[n]]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4 ; sets the flags tested by jge below
- mov [esp + .innerk], edx ; j count minus 4 (mov leaves the flags from sub intact)
- jge .unroll_loop ; at least four j particles: take the unrolled path
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4] ; q(j1)
- movss xmm4, [esi + ecx*4] ; q(j3)
- movss xmm6, [esi + ebx*4] ; q(j2)
- movss xmm7, [esi + edx*4] ; q(j4)
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b ; xmm3 = q1 q1 q2 q2
- shufps xmm4, xmm7, 00000000b ; xmm4 = q3 q3 q4 q4
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4] ; eax-edx = type of j1-j4
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1 ; two floats per table entry
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi ; eax-edx = nbfp index for each i-j pair
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4] ; xmm6 = c6(1) c12(1) c6(2) c12(2)
- movlps xmm7, [esi + ecx*4] ; xmm7 = c6(3) c12(3) c6(4) c12(4)
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b ; xmm4 = c6 for j1-j4
- shufps xmm6, xmm7, 11011101b ; xmm6 = c12 for j1-j4
-
- movd eax, mm0 ; restore the four j indices
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm2 ; xmm3 = qq = iq * q(j)
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4] ; xmm4 = x1 y1 (x2 y2 after movhps)
- movlps xmm5, [esi + ecx*4] ; xmm5 = x3 y3 (x4 y4 after movhps)
- movss xmm2, [esi + eax*4 + 8] ; z1
- movss xmm6, [esi + ecx*4 + 8] ; z3
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8] ; z2
- movss xmm1, [esi + edx*4 + 8] ; z4
-
- shufps xmm2, xmm0, 0b ; xmm2 = z1 z1 z2 z2
- shufps xmm6, xmm1, 0b ; xmm6 = z3 z3 z4 z4
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b ; xmm2 = z1 z2 z3 z4
-
- shufps xmm0, xmm5, 10001000b ; xmm0 = x1 x2 x3 x4
- shufps xmm1, xmm5, 11011101b ; xmm1 = y1 y2 y3 y4
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
- ; one Newton-Raphson step on the rsqrtps estimate lu: rinv = 0.5*lu*(3.0 - rsq*lu*lu)
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6] ; xmm1 = vnb6 = c6*rinvsix
- mulps xmm2, [esp + .c12] ; xmm2 = vnb12 = c12*rinvtwelve
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1 ; 12*vnb12 - 6*vnb6
- addps xmm2, xmm3 ; ... + vcoul
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4] ; xmm4 = fx1 fy1 (fx2 fy2 after movhps)
- movlps xmm6, [edi + ecx*4] ; xmm6 = fx3 fy3 (fx4 fy4 after movhps)
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b ; xmm3 = fjx for j1-j4
- shufps xmm4, xmm6, 11011101b ; xmm4 = fjy for j1-j4
-
- ; xmm3/xmm4 now hold fjx/fjy for j1-j4 (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2 ; lane 0 of xmm2 = tz for j1
- shufps xmm2, xmm2, 11100101b ; bring tz for j2 into lane 0
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b ; bring tz for j3 into lane 0
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b ; bring tz for j4 into lane 0
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner ; fewer than four j particles left
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4 ; undo the -4 bias: innerk = 0..3 remaining j particles
- mov edx, [esp + .innerk]
- and edx, 10b ; bit 1 set: process a pair
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx] ; eax, ebx = the two j indices
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- xorps xmm3, xmm3
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00001100b ; xmm3 = q1 0 q2 q2
- shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4] ; xmm6 = c6(1) c12(1) c6(2) c12(2)
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b ; lanes 0,1 = c6(1) c6(2)
- shufps xmm6, xmm6, 1101b ; lanes 0,1 = c12(1) c12(2)
- movlhps xmm4, xmm7 ; zero the two unused upper lanes
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4] ; xmm1 = x1 y1 (x2 y2 after movhps)
- movss xmm2, [edi + eax*4 + 8] ; z1
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8] ; z2
-
- mulps xmm3, [esp + .iq] ; qq in lanes 0,1
-
- movlhps xmm3, xmm7 ; upper lanes of qq forced to zero
-
- shufps xmm2, xmm0, 0b ; xmm2 = z1 z1 z2 z2
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b ; xmm2 = z1 z2 z1 z2
-
- shufps xmm0, xmm0, 10001000b ; xmm0 = x1 x2 x1 x2
- shufps xmm1, xmm1, 11011101b ; xmm1 = y1 y2 y1 y2
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
- ; same Newton-Raphson refinement as in .unroll_loop
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- ; qq, c6 and c12 are zero in lanes 2,3, so those lanes add nothing to energies or forces
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4] ; first j particle of the pair
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b ; swap lanes 0 and 1: second pair member into lane 0
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4] ; second j particle of the pair
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b ; bit 0 set: one last j particle remains
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- xorps xmm3, xmm3
- mov eax, [ecx] ; eax = last j index
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4] ; xmm6 = c6 c12 0 0
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b ; xmm4 = c6 0 0 0
- shufps xmm6, xmm6, 11111101b ; xmm6 = c12 0 0 0
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2] ; replace jnr with j3
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4] ; movss from memory zeroes lanes 1-3
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq] ; qq in lane 0, zero elsewhere
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
- ; NOTE(review): lanes 1-3 hold |i position|^2 here; if that is exactly 0, rsqrtps gives inf and 0*inf may put NaN into vctot/vnbtot - confirm
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0 ; copy the upper two lanes down
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums are now in the two low lanes of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b ; bring lane 1 into lane 0
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6 ; lane 0 = sum of all four vctot lanes
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6 ; lane 0 = sum of all four vnbtot lanes
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end ; no i particles left
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx ; store the decremented count back into the argument slot
- jmp .outer
-.end:
- emms ; leave the MMX state empty after using mm0-mm3
- mov eax, [esp + .salign]
- add esp, eax ; undo the alignment adjustment
- add esp, dword 312 ; release the local stack space
- pop edi ; restore registers in reverse push order
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl2100_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$krf arg
-%$crf arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.c6 equ 112
-.c12 equ 128
-.six equ 144
-.twelve equ 160
-.vctot equ 176
-.vnbtot equ 192
-.fix equ 208
-.fiy equ 224
-.fiz equ 240
-.half equ 256
-.three equ 272
-.two equ 288
-.krf equ 304
-.crf equ 320
-.is3 equ 336
-.ii3 equ 340
-.ntia equ 344
-.innerjjnr equ 348
-.innerk equ 352
-.salign equ 356
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, dword 360 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b
- shufps xmm6, xmm6, 0b
- movaps [esp + .krf], xmm5
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
- movaps xmm1, xmm4
- subps xmm6, [esp + .crf]
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm6, xmm3 ; xmm6=vcoul=qq*(rinv+krsq)
- mulps xmm7, [esp + .two]
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- subps xmm0, xmm7
- mulps xmm3, xmm0
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- xorps xmm3, xmm3
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00001100b
- shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
- movaps xmm1, xmm4
- subps xmm6, [esp + .crf]
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm6, xmm3 ; xmm6=vcoul=qq*(rinv+krsq-crf)
- mulps xmm7, [esp + .two]
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- subps xmm0, xmm7
- mulps xmm3, xmm0
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- xorps xmm3, xmm3
- mov eax, [ecx]
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
- movaps xmm1, xmm4
- subps xmm6, [esp + .crf]
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm6, xmm3 ; xmm6=vcoul
- mulps xmm7, [esp + .two]
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- subps xmm0, xmm7
- mulps xmm3, xmm0
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, dword 360
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-;; inl2000_sse - SSE (4 x single precision) nonbonded inner loop, Coulomb only.
-;; For each of the nri i particles it walks that particle's j list
-;; (jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]) and for every i-j pair computes
-;;   qq    = facel*charge[i]*charge[j]
-;;   vcoul = qq*(rinv + krf*rsq - crf)
-;;   fscal = qq*(rinv - 2*krf*rsq)*rinvsq
-;; fscal*dr is subtracted from faction[j] and accumulated for particle i;
-;; after the j list the i force is added to faction[i] and fshift[shift[n]],
-;; and the summed vcoul is added to Vc[gid[n]].
-;; (The krf/crf names suggest a reaction-field potential - confirm with caller.)
-;; The j list is processed four at a time, then a pair, then a single leftover.
-;; The argument slots nri, shift, iinr, jindex and gid are modified in place
-;; (used as loop counter / running pointers).
-;; All general registers pushed in the prologue are restored; xmm0-xmm7 are clobbered.
-proc inl2000_sse
-%$nri arg                       ; number of i particles; decremented in place, must be >= 1
-%$iinr arg                      ; pointer into i particle index list, advanced per i particle
-%$jindex arg                    ; pointer into jindex[]; entries n and n+1 bound the j list
-%$jjnr arg                      ; base of j particle index array
-%$shift arg                     ; pointer into shift index list, advanced per i particle
-%$shiftvec arg                  ; base of shift vectors, 3 floats per entry
-%$fshift arg                    ; base of shift force array, 3 floats per entry
-%$gid arg                       ; pointer into group index list, advanced per i particle
-%$pos arg                       ; base of coordinates, 3 floats per particle
-%$faction arg                   ; base of forces, 3 floats per particle
-%$charge arg                    ; base of charges, 1 float per particle
-%$facel arg                     ; float, multiplied into the i particle charge
-%$Vc arg                        ; base of Coulomb energy array, indexed by group index
-%$krf arg                       ; float, coefficient of rsq in the potential
-%$crf arg                       ; float, constant subtracted in the potential
- ;; stack offsets for local variables
- ;; esp is rounded down to a 16-byte boundary below, so the 16-byte
- ;; slots (offsets 0-240) can be accessed with movaps
-.ix equ 0                       ; shifted i position, each broadcast to 4 lanes
-.iy equ 16
-.iz equ 32
-.iq equ 48                      ; facel*charge[i], broadcast
-.dx equ 64                      ; i-j distance vector components for the current j's
-.dy equ 80
-.dz equ 96
-.vctot equ 112                  ; per-lane Coulomb energy accumulator
-.fix equ 128                    ; per-lane i force accumulators
-.fiy equ 144
-.fiz equ 160
-.half equ 176                   ; copies of sse_half / sse_three / sse_two
-.three equ 192
-.two equ 208
-.krf equ 224                    ; krf broadcast to 4 lanes
-.crf equ 240                    ; crf broadcast to 4 lanes
-.is3 equ 256                    ; 3*shift index (dword)
-.ii3 equ 260                    ; 3*i particle index (dword)
-.innerjjnr equ 264              ; running pointer into jjnr[] (dword)
-.innerk equ 268                 ; remaining j count bookkeeping (dword)
-.salign equ 272                 ; bytes subtracted from esp for alignment (dword)
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, dword 276 ; local stack space (offsets 0..275 above)
- mov eax, esp
- and eax, 0xf                   ; eax = misalignment of esp relative to 16 bytes
- sub esp, eax                   ; esp is now 16-byte aligned
- mov [esp + .salign], eax       ; remember the adjustment for the epilogue
-
- emms
-
- movups xmm0, [sse_half]        ; constants defined outside this procedure
- movups xmm1, [sse_three]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b          ; broadcast krf to all four lanes
- movaps [esp + .krf], xmm5
- shufps xmm6, xmm6, 0b          ; broadcast crf to all four lanes
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]      ; xmm0-xmm2 = shift vector x, y, z
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]      ; charge[ii]
- mulss xmm3, [ebp + %$facel]    ; iq = facel*charge[ii]
- shufps xmm3, xmm3, 0b          ; broadcast iq
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]      ; i position = shift vector + pos[ii]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b          ; broadcast ix, iy, iz
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2                     ; byte offset of jjnr[nj0]
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; (number of innerloop atoms) - 4; mov keeps the flags from sub
- jge .unroll_loop               ; at least four j particles available
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]      ; q(j1)
- movss xmm4, [esi + ecx*4]      ; q(j3)
- movss xmm6, [esi + ebx*4]      ; q(j2)
- movss xmm7, [esi + edx*4]      ; q(j4)
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b   ; (q1, q1, q2, q2)
- shufps xmm4, xmm7, 00000000b   ; (q3, q3, q4, q4)
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3: (q1, q2, q3, q4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm2               ; xmm3 = qq = iq*q(j)
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]     ; low half: x,y of j1
- movlps xmm5, [esi + ecx*4]     ; low half: x,y of j3
- movss xmm2, [esi + eax*4 + 8]  ; z of j1
- movss xmm6, [esi + ecx*4 + 8]  ; z of j3
-
- movhps xmm4, [esi + ebx*4]     ; high half: x,y of j2
- movhps xmm5, [esi + edx*4]     ; high half: x,y of j4
-
- movss xmm0, [esi + ebx*4 + 8]  ; z of j2
- movss xmm1, [esi + edx*4 + 8]  ; z of j4
-
- shufps xmm2, xmm0, 0b          ; (z1, z1, z2, z2)
- shufps xmm6, xmm1, 0b          ; (z3, z3, z4, z4)
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b   ; xmm2 = (z1, z2, z3, z4)
-
- shufps xmm0, xmm5, 10001000b   ; xmm0 = (x1, x2, x3, x4)
- shufps xmm1, xmm5, 11011101b   ; xmm1 = (y1, y2, y3, y4)
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ; rinv = 0.5*lu*(3.0 - rsq*lu*lu): one Newton-Raphson step on the
- ; rsqrtps estimate lu, interleaved with the krf*rsq product
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ; rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
-
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-
- mulps xmm6, xmm3 ; xmm6=vcoul=qq*(rinv+krsq-crf)
- mulps xmm7, [esp + .two]       ; xmm7=2*krsq
-
- subps xmm0, xmm7               ; xmm0=rinv-2*krsq
- mulps xmm3, xmm0               ; xmm3=qq*(rinv-2*krsq)
- mulps xmm4, xmm3 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]     ; same x,y gather layout as for the coordinates
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm4 contain fjx, fjy (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2               ; lane 0 of xmm2 = tz for j1
- shufps xmm2, xmm2, 11100101b   ; bring tz for j2 into lane 0
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b   ; bring tz for j3 into lane 0
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b   ; bring tz for j4 into lane 0
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner               ; fewer than four j particles left
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4   ; undo the -4 bias: innerk = 0..3 leftover j particles
- mov edx, [esp + .innerk]
- and edx, 10b                   ; bit 1 set means 2 or 3 are left
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- xorps xmm3, xmm3
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00001100b   ; (q1, 0, q2, q2)
- shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.
-
- mov edi, [ebp + %$pos]
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7,xmm7
- movlhps xmm3, xmm7             ; zero qq in lanes 2-3 so they add no energy or force
-
- shufps xmm2, xmm0, 0b          ; (z1, z1, z2, z2)
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b   ; (z1, z2, z1, z2)
-
- shufps xmm0, xmm0, 10001000b   ; (x1, x2, x1, x2)
- shufps xmm1, xmm1, 11011101b   ; (y1, y2, y1, y2)
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ; same rsqrtps + Newton-Raphson sequence as in the unrolled loop
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ; rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
-
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-
- mulps xmm6, xmm3 ; xmm6=vcoul=qq*(rinv+krsq-crf)
- mulps xmm7, [esp + .two]       ; xmm7=2*krsq
-
- subps xmm0, xmm7               ; xmm0=rinv-2*krsq
- mulps xmm3, xmm0               ; xmm3=qq*(rinv-2*krsq)
-
- mulps xmm4, xmm3 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]      ; first j particle: lane 0 of tx-tz
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b   ; swap lanes 0 and 1: second j particle into lane 0
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b                    ; bit 0 set means one j particle is still left
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- xorps xmm3, xmm3
- mov eax, [ecx]
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- ; (movss from memory zeroes lanes 1-3, so those lanes see a j position
- ; of 0 and qq = 0)
- ; NOTE(review): if the shifted i position were exactly (0,0,0), rsq
- ; would be 0 in lanes 1-3 and rsqrtps would return inf there, which
- ; could put NaN into vctot/fi - confirm this cannot occur.
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7               ; (xmm7 is reloaded with krf below)
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- ; same rsqrtps + Newton-Raphson sequence as in the unrolled loop
- movaps xmm7, [esp + .krf]
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ; rsq*lu*lu
- movaps xmm0, [esp + .half]
- mulps xmm7, xmm4 ; xmm7=krsq
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm6, xmm0
- addps xmm6, xmm7 ; xmm6=rinv+krsq
-
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-
- mulps xmm6, xmm3 ; xmm6=vcoul
- mulps xmm7, [esp + .two]       ; xmm7=2*krsq
-
- subps xmm0, xmm7               ; xmm0=rinv-2*krsq
- mulps xmm3, xmm0               ; xmm3=qq*(rinv-2*krsq)
- mulps xmm4, xmm3 ; xmm4=total fscal
- addps xmm6, [esp + .vctot]
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm6
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0             ; lanes 2-3 copied down to lanes 0-1 of the temp
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now split over lanes 0 and 1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b          ; move lane 1 into lane 0
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; edx = pointer into gid[]
- mov edx, [edx]                 ; edx = group index for this i particle
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate (same horizontal add as for the forces above)
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx         ; the nri argument slot doubles as the loop counter
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax                   ; undo the alignment adjustment
- add esp, dword 276             ; release local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-proc inl1110_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.c6 equ 112
-.c12 equ 128
-.two equ 144
-.six equ 160
-.twelve equ 176
-.vctot equ 192
-.vnbtot equ 208
-.fix equ 224
-.fiy equ 240
-.fiz equ 256
-.half equ 272
-.three equ 288
-.is3 equ 304
-.ii3 equ 308
-.shX equ 312
-.shY equ 316
-.shZ equ 320
-.ntia equ 324
-.innerjjnr0 equ 328
-.innerk0 equ 332
-.innerjjnr equ 336
-.innerk equ 340
-.salign equ 344
-.nsvdwc equ 348
-.nscoul equ 352
-.nsvdw equ 356
-.solnr equ 360
-
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 364 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movups xmm3, [sse_six]
- movups xmm4, [sse_twelve]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- movaps [esp + .six], xmm3
- movaps [esp + .twelve], xmm4
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movlps xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 8]
- movlps [esp + .shX], xmm0
- movss [esp + .shZ], xmm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm2
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
-.testcoul:
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-
-.unroll_coul_loop:
- ;; quad-unrolled innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm5, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- mulps xmm3, xmm5
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- mov edi, [ebp + %$faction]
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_coul
- jmp .checksingle_coul
-.dopair_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- mulps xmm3, [esp + .iq]
- xorps xmm7,xmm7
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_coul
- jmp .updateouterdata_coul
-.dosingle_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- movss xmm3, [esi + eax*4] ; xmm3(0) has the charge
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- mulps xmm3, [esp + .iq]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- mov edi, [ebp + %$faction]
- movaps xmm5, [esp + .vctot]
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm4, xmm3 ; xmm4=fscal
- addps xmm5, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
-.testvdw:
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
- xorps xmm7,xmm7
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- xorps xmm6, xmm6
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-.last_mno:
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 364
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl1120_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.c6 equ 352
-.c12 equ 368
-.six equ 384
-.twelve equ 400
-.vctot equ 416
-.vnbtot equ 432
-.fixO equ 448
-.fiyO equ 464
-.fizO equ 480
-.fixH1 equ 496
-.fiyH1 equ 512
-.fizH1 equ 528
-.fixH2 equ 544
-.fiyH2 equ 560
-.fizH2 equ 576
-.fjx equ 592
-.fjy equ 608
-.fjz equ 624
-.half equ 640
-.three equ 656
-.is3 equ 672
-.ii3 equ 676
-.ntia equ 680
-.innerjjnr equ 684
-.innerk equ 688
-.salign equ 692
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 696 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- ; start with rsqO - seed in xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm7, xmm4 ; rinvO in xmm7
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm6, xmm4 ; rinvH1 in xmm6
- ; rsqH2 - seed in xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm5, xmm4 ; rinvH2 in xmm5
-
- ; do O interactions
- movaps xmm4, xmm7
- mulps xmm4, xmm4 ; xmm7=rinv, xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm7, [esp + .qqO] ;xmm7=vcoul
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm3, xmm2
- subps xmm3, xmm1 ; vnb=vnb12-vnb6
- addps xmm3, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- addps xmm2, xmm7
- mulps xmm4, xmm2 ; total fsO in xmm4
-
- addps xmm7, [esp + .vctot]
-
- movaps [esp + .vnbtot], xmm3
- movaps [esp + .vctot], xmm7
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H1 interactions
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm6=rinv, xmm4=rinvsq
- mulps xmm6, [esp + .qqH] ;xmm6=vcoul
- mulps xmm4, xmm6 ; total fsH1 in xmm4
-
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- movaps [esp + .vctot], xmm6
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H2 interactions
- movaps xmm4, xmm5
- mulps xmm4, xmm4 ; xmm5=rinv, xmm4=rinvsq
- mulps xmm5, [esp + .qqH] ;xmm5=vcoul
- mulps xmm4, xmm5 ; total fsH1 in xmm4
-
- addps xmm5, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- movaps [esp + .vctot], xmm5
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- xorps xmm6, xmm6
- mov esi, [ebp + %$type]
- mov ebx, [esi + eax*4]
- mov esi, [ebp + %$nbfp]
- shl ebx, 1
- add ebx, [esp + .ntia]
- movlps xmm6, [esi + ebx*4]
- movaps xmm7, xmm6
- shufps xmm6, xmm6, 11111100b
- shufps xmm7, xmm7, 11111101b
- movaps [esp + .c6], xmm6
- movaps [esp + .c12], xmm7
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulss xmm1, xmm4
- movaps xmm3, [esp + .qqO]
- mulss xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulss xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm3, xmm0 ; xmm3=vcoul
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subss xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulss xmm1, [esp + .six]
- mulss xmm2, [esp + .twelve]
- subss xmm2, xmm1
- addps xmm2, xmm3
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm3, [esp + .vctot]
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- movaps [esp + .vctot], xmm3
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 696
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl1130_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.c6 equ 768
-.c12 equ 784
-.six equ 800
-.twelve equ 816
-.vctot equ 832
-.vnbtot equ 848
-.fixO equ 864
-.fiyO equ 880
-.fizO equ 896
-.fixH1 equ 912
-.fiyH1 equ 928
-.fizH1 equ 944
-.fixH2 equ 960
-.fiyH2 equ 976
-.fizH2 equ 992
-.fjxO equ 1008
-.fjyO equ 1024
-.fjzO equ 1040
-.fjxH1 equ 1056
-.fjyH1 equ 1072
-.fjzH1 equ 1088
-.fjxH2 equ 1104
-.fjyH2 equ 1120
-.fjzH2 equ 1136
-.half equ 1152
-.three equ 1168
-.rsqOO equ 1184
-.rsqOH1 equ 1200
-.rsqOH2 equ 1216
-.rsqH1O equ 1232
-.rsqH1H1 equ 1248
-.rsqH1H2 equ 1264
-.rsqH2O equ 1280
-.rsqH2H1 equ 1296
-.rsqH2H2 equ 1312
-.rinvOO equ 1328
-.rinvOH1 equ 1344
-.rinvOH2 equ 1360
-.rinvH1O equ 1376
-.rinvH1H1 equ 1392
-.rinvH1H2 equ 1408
-.rinvH2O equ 1424
-.rinvH2H1 equ 1440
-.rinvH2H2 equ 1456
-.is3 equ 1472
-.ii3 equ 1476
-.innerjjnr equ 1480
-.innerk equ 1484
-.salign equ 1488
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1492 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
- xorps xmm0, xmm0
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- add edx, ecx
- mov eax, [ebp + %$nbfp]
- movlps xmm0, [eax + edx*4]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 01010101b
- movaps [esp + .c6], xmm0
- movaps [esp + .c12], xmm1
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm7, xmm0
- mulps xmm0, xmm0
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- mulps xmm7, [esp + .qqOO]
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm3, xmm2
- subps xmm3, xmm1 ; xmm3=vnb12-vnb6
- addps xmm3, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm3
- subps xmm2, xmm1
- addps xmm2, xmm7
- addps xmm7, [esp + .vctot]
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsOH1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsOH2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsH1O
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH1H1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsOH2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqOH]
- mulps xmm0, xmm1 ; fsH2O
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH2H1
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm1, xmm0
- mulps xmm0, xmm0
- mulps xmm1, [esp + .qqHH]
- mulps xmm0, xmm1 ; fsH2H2
- addps xmm7, xmm1 ; add to local vctot.
- movaps xmm1, xmm0
- movaps [esp + .vctot], xmm7
- movaps xmm2, xmm0
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- xorps xmm1, xmm1
- movaps xmm0, xmm3
- xorps xmm4, xmm4
- mulps xmm0, xmm0 ; xmm0=rinvsq
- ;; fetch charges to xmm4 (temporary)
- movss xmm4, [esp + .qqOO]
- movss xmm1, xmm0
- movhps xmm4, [esp + .qqOH]
- mulss xmm1, xmm0
- mulps xmm3, xmm4 ; xmm3=vcoul
- mulss xmm1, xmm0 ; xmm1(0)=rinvsix
- movaps xmm2, xmm1 ; zero everything else in xmm2
- mulss xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulss xmm1, [esp + .c6]
- mulss xmm2, [esp + .c12]
- movaps xmm4, xmm2
- subss xmm4, xmm1 ; vnbtot=vnb12-vnb6
- addps xmm4, [esp + .vnbtot]
- mulss xmm1, [esp + .six]
- mulss xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm4
- subss xmm2, xmm1 ; fsD+fsR
- addps xmm2, xmm3 ; fsC+fsD+fsR
-
- addps xmm3, [esp + .vctot]
- mulps xmm0, xmm2 ; total fscal
- movaps [esp + .vctot], xmm3
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- ;; assemble charges in xmm6
- xorps xmm6, xmm6
- ; do coulomb interaction
- movaps xmm0, xmm3
- movss xmm6, [esp + .qqOH]
- movaps xmm4, xmm7
- movhps xmm6, [esp + .qqHH]
- mulps xmm0, xmm0 ; rinvsq
- mulps xmm4, xmm4 ; rinvsq
- mulps xmm3, xmm6 ; vcoul
- mulps xmm7, xmm6 ; vcoul
- movaps xmm2, xmm3
- addps xmm2, xmm7 ; total vcoul
- mulps xmm0, xmm3 ; fscal
-
- addps xmm2, [esp + .vctot]
- mulps xmm7, xmm4 ; fscal
- movaps [esp + .vctot], xmm2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do forces H2 - j water
- movaps xmm0, xmm7
- movaps xmm1, xmm7
- movaps xmm2, xmm7
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1492
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl2120_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$krf arg
-%$crf arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.c6 equ 352
-.c12 equ 368
-.six equ 384
-.twelve equ 400
-.vctot equ 416
-.vnbtot equ 432
-.fixO equ 448
-.fiyO equ 464
-.fizO equ 480
-.fixH1 equ 496
-.fiyH1 equ 512
-.fizH1 equ 528
-.fixH2 equ 544
-.fiyH2 equ 560
-.fizH2 equ 576
-.fjx equ 592
-.fjy equ 608
-.fjz equ 624
-.half equ 640
-.three equ 656
-.two equ 672
-.krf equ 688
-.crf equ 704
-.krsqO equ 720
-.krsqH1 equ 736
-.krsqH2 equ 752
-.is3 equ 768
-.ii3 equ 772
-.ntia equ 776
-.innerjjnr equ 780
-.innerk equ 784
-.salign equ 788
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 792 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b
- shufps xmm6, xmm6, 0b
- movaps [esp + .krf], xmm5
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- movaps xmm0, xmm5
- movaps xmm1, xmm6
- movaps xmm2, xmm7
-
- mulps xmm0, [esp + .krf]
- mulps xmm1, [esp + .krf]
- mulps xmm2, [esp + .krf]
-
- movaps [esp + .krsqH2], xmm0
- movaps [esp + .krsqH1], xmm1
- movaps [esp + .krsqO], xmm2
-
- ; start with rsqO - seed in xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm7, xmm4 ; rinvO in xmm7
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm6, xmm4 ; rinvH1 in xmm6
- ; rsqH2 - seed in xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm5, xmm4 ; rinvH2 in xmm5
-
- ; do O interactions
- movaps xmm4, xmm7
- mulps xmm4, xmm4 ; xmm7=rinv, xmm4=rinvsq
- movaps xmm1, xmm4
- mulps xmm1, xmm4
- mulps xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm3, xmm2
- subps xmm3, xmm1 ; vnb=vnb12-vnb6
- addps xmm3, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1 ; nb part of fs
-
- movaps xmm0, xmm7
- movaps xmm1, [esp + .krsqO]
- addps xmm0, xmm1
- mulps xmm1, [esp + .two]
- subps xmm0, [esp + .crf] ; xmm0=rinv+krsq-crf
- subps xmm7, xmm1
- mulps xmm0, [esp + .qqO]
- mulps xmm7, [esp + .qqO]
- addps xmm2, xmm7
-
- mulps xmm4, xmm2 ; total fsO in xmm4
-
- addps xmm0, [esp + .vctot]
- movaps [esp + .vnbtot], xmm3
- movaps [esp + .vctot], xmm0
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H1 interactions
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm6=rinv, xmm4=rinvsq
- movaps xmm7, xmm6
- movaps xmm0, [esp + .krsqH1]
- addps xmm6, xmm0 ; xmm6=rinv+krsq
- mulps xmm0, [esp + .two]
- subps xmm6, [esp + .crf]
- subps xmm7, xmm0 ; xmm7=rinv-2*krsq
- mulps xmm6, [esp + .qqH] ; vcoul
- mulps xmm7, [esp + .qqH]
- mulps xmm4, xmm7 ; total fsH1 in xmm4
-
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- movaps [esp + .vctot], xmm6
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H2 interactions
- movaps xmm4, xmm5
- mulps xmm4, xmm4 ; xmm5=rinv, xmm4=rinvsq
- movaps xmm7, xmm5
- movaps xmm0, [esp + .krsqH2]
- addps xmm5, xmm0 ; xmm5=rinv+krsq
- mulps xmm0, [esp + .two]
- subps xmm5, [esp + .crf]
- subps xmm7, xmm0 ; xmm7=rinv-2*krsq
- mulps xmm5, [esp + .qqH] ; vcoul
- mulps xmm7, [esp + .qqH]
- mulps xmm4, xmm7 ; total fsH2 in xmm4
-
- addps xmm5, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- movaps [esp + .vctot], xmm5
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- xorps xmm6, xmm6
- mov esi, [ebp + %$type]
- mov ebx, [esi + eax*4]
- mov esi, [ebp + %$nbfp]
- shl ebx, 1
- add ebx, [esp + .ntia]
- movlps xmm6, [esi + ebx*4]
- movaps xmm7, xmm6
- shufps xmm6, xmm6, 11111100b
- shufps xmm7, xmm7, 11111101b
- movaps [esp + .c6], xmm6
- movaps [esp + .c12], xmm7
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- movaps xmm0, xmm4
- mulps xmm0, [esp + .krf]
- movaps [esp + .krsqO], xmm0
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
- movaps xmm1, xmm4
- mulss xmm1, xmm4
- mulss xmm1, xmm4 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulss xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subss xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulss xmm1, [esp + .six]
- mulss xmm2, [esp + .twelve]
- subss xmm2, xmm1
-
- movaps xmm1, xmm0 ; xmm1=rinv
- movaps xmm3, [esp + .krsqO]
- addps xmm0, xmm3 ; xmm0=rinv+krsq
- mulps xmm3, [esp + .two]
- subps xmm0, [esp + .crf] ; xmm0=rinv+krsq-crf
- subps xmm1, xmm3 ; xmm1=rinv-2*krsq
- mulps xmm0, [esp + .qqO] ; xmm0=vcoul
- mulps xmm1, [esp + .qqO] ; xmm1=coul part of fs
-
- addps xmm2, xmm1 ; total fs
-
- mulps xmm4, xmm2 ; xmm4=total fscal
- addps xmm0, [esp + .vctot]
- movaps [esp + .vctot], xmm0
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 792
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl2130_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$krf arg
-%$crf arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.c6 equ 768
-.c12 equ 784
-.six equ 800
-.twelve equ 816
-.vctot equ 832
-.vnbtot equ 848
-.fixO equ 864
-.fiyO equ 880
-.fizO equ 896
-.fixH1 equ 912
-.fiyH1 equ 928
-.fizH1 equ 944
-.fixH2 equ 960
-.fiyH2 equ 976
-.fizH2 equ 992
-.fjxO equ 1008
-.fjyO equ 1024
-.fjzO equ 1040
-.fjxH1 equ 1056
-.fjyH1 equ 1072
-.fjzH1 equ 1088
-.fjxH2 equ 1104
-.fjyH2 equ 1120
-.fjzH2 equ 1136
-.half equ 1152
-.three equ 1168
-.rsqOO equ 1184
-.rsqOH1 equ 1200
-.rsqOH2 equ 1216
-.rsqH1O equ 1232
-.rsqH1H1 equ 1248
-.rsqH1H2 equ 1264
-.rsqH2O equ 1280
-.rsqH2H1 equ 1296
-.rsqH2H2 equ 1312
-.rinvOO equ 1328
-.rinvOH1 equ 1344
-.rinvOH2 equ 1360
-.rinvH1O equ 1376
-.rinvH1H1 equ 1392
-.rinvH1H2 equ 1408
-.rinvH2O equ 1424
-.rinvH2H1 equ 1440
-.rinvH2H2 equ 1456
-.two equ 1472
-.krf equ 1488
-.crf equ 1504
-.is3 equ 1520
-.ii3 equ 1524
-.innerjjnr equ 1528
-.innerk equ 1532
-.salign equ 1536
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1540 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm2, [sse_six]
- movups xmm3, [sse_twelve]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .six], xmm2
- movaps [esp + .twelve], xmm3
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b
- shufps xmm6, xmm6, 0b
- movaps [esp + .krf], xmm5
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
- xorps xmm0, xmm0
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- add edx, ecx
- mov eax, [ebp + %$nbfp]
- movlps xmm0, [eax + edx*4]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 01010101b
- movaps [esp + .c6], xmm0
- movaps [esp + .c12], xmm1
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- mulps xmm0, xmm0
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- mulps xmm5, [esp + .rsqOO] ; xmm5=krsq
- movaps xmm6, xmm5
- addps xmm6, xmm7 ; xmm6=rinv+krsq
- subps xmm6, [esp + .crf]
-
- mulps xmm6, [esp + .qqOO] ; xmm6=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOO] ; xmm7 = coul part of fscal
-
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm3, xmm2
- subps xmm3, xmm1 ; xmm3=vnb12-vnb6
- addps xmm3, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm3
- subps xmm2, xmm1
- addps xmm2, xmm7
- addps xmm6, [esp + .vctot] ; local vctot summation variable
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqOH1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- mulps xmm0, xmm0
- subps xmm4, [esp + .crf]
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH1
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqOH2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- mulps xmm0, xmm0
- subps xmm4, [esp + .crf]
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1O] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- mulps xmm0, xmm0
- subps xmm4, [esp + .crf]
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1H1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1H2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- mulps xmm0, xmm0
- subps xmm4, [esp + .crf]
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction (i-water H2 force in fi*H2, j-water O force in fj*O)
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0 ; this copy is never read: xmm1 is reloaded from xmm0 below
- mulps xmm5, [esp + .rsqH2O] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf] ; xmm4=rinv+krsq-crf
- mulps xmm0, xmm0 ; xmm0=rinvsq
- mulps xmm4, [esp + .qqOH] ; xmm4=vcoul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two] ; xmm5=2*krsq
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; xmm0=fscal for H2-O = rinvsq*qq*(rinv-2*krsq)
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O] ; xmm0-xmm2 = fscal*(dx,dy,dz)
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0 ; subtract the force from the j O accumulators
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2] ; add the force to the i H2 accumulators
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction (i-water H2 force in fi*H2, j-water H1 force in fj*H1)
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0 ; this copy is never read: xmm1 is reloaded from xmm0 below
- mulps xmm5, [esp + .rsqH2H1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf] ; xmm4=rinv+krsq-crf
- mulps xmm0, xmm0 ; xmm0=rinvsq
- mulps xmm4, [esp + .qqHH] ; xmm4=vcoul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two] ; xmm5=2*krsq
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; xmm0=fscal for H2-H1 = rinvsq*qq*(rinv-2*krsq)
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1] ; xmm0-xmm2 = fscal*(dx,dy,dz)
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0 ; subtract the force from the j H1 accumulators
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2] ; add the force to the i H2 accumulators
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction (i-water H2 force in fi*H2, j-water H2 force in fj*H2)
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0 ; this copy is never read: xmm1 is reloaded from xmm0 below
- mulps xmm5, [esp + .rsqH2H2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf] ; xmm4=rinv+krsq-crf
- mulps xmm0, xmm0 ; xmm0=rinvsq
- mulps xmm4, [esp + .qqHH] ; xmm4=vcoul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two] ; xmm5=2*krsq
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; xmm0=fscal for H2-H2 = rinvsq*qq*(rinv-2*krsq)
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm1, xmm0 ; repeats the copy made just above (xmm0 is unchanged in between)
- movaps [esp + .vctot], xmm6 ; write the locally accumulated vctot back to the stack
- movaps xmm2, xmm0 ; repeats the copy made just above
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2] ; xmm0-xmm2 = fscal*(dx,dy,dz)
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0 ; subtract the force from the j H2 accumulators
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2] ; add the force to the i H2 accumulators
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check: ; reached when the unrolled loop's "sub innerk, 4" went negative
- add [esp + .innerk], dword 4 ; undo that subtraction: innerk = number of j entries still left
- jnz .single_loop ; leftovers exist: process them one per iteration
- jmp .updateouterdata ; nothing left in this neighbour list
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- movaps xmm6, xmm0
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- mulps xmm6, [esp + .krf] ; xmm6=krsq
- movaps xmm2, xmm1
- movaps xmm7, xmm6
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- addps xmm6, xmm3 ; xmm6=rinv+krsq
- mulps xmm7, [esp + .two]
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-
- xorps xmm1, xmm1
- movaps xmm0, xmm3
- subps xmm3, xmm7 ; xmm3=rinv-2*krsq
- xorps xmm4, xmm4
- mulps xmm0, xmm0 ; xmm0=rinvsq
- ;; fetch charges to xmm4 (temporary)
- movss xmm4, [esp + .qqOO]
- movss xmm1, xmm0
- movhps xmm4, [esp + .qqOH]
- mulss xmm1, xmm0
-
- mulps xmm6, xmm4 ; vcoul
- mulps xmm3, xmm4 ; coul part of fs
-
- mulss xmm1, xmm0 ; xmm1(0)=rinvsix
- movaps xmm2, xmm1 ; zero everything else in xmm2
- mulss xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulss xmm1, [esp + .c6]
- mulss xmm2, [esp + .c12]
- movaps xmm4, xmm2
- subss xmm4, xmm1 ; vnbtot=vnb12-vnb6
- addps xmm4, [esp + .vnbtot]
- mulss xmm1, [esp + .six]
- mulss xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm4
- subss xmm2, xmm1 ; fsD+fsR
- addps xmm2, xmm3 ; fsC+fsD+fsR
-
- addps xmm6, [esp + .vctot]
- mulps xmm0, xmm2 ; total fscal
- movaps [esp + .vctot], xmm6
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
-
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- mulps xmm0, [esp + .krf] ; krsq
- mulps xmm4, [esp + .krf] ; krsq
-
-
- ;; assemble charges in xmm6
- xorps xmm6, xmm6
- movss xmm6, [esp + .qqOH]
- movhps xmm6, [esp + .qqHH]
- movaps xmm1, xmm0
- movaps xmm5, xmm4
- addps xmm0, xmm3 ; krsq+rinv
- addps xmm4, xmm7 ; krsq+rinv
- subps xmm0, [esp + .crf]
- subps xmm4, [esp + .crf]
- mulps xmm1, [esp + .two]
- mulps xmm5, [esp + .two]
- mulps xmm0, xmm6 ; vcoul
- mulps xmm4, xmm6 ; vcoul
- addps xmm4, xmm0
- addps xmm4, [esp + .vctot]
- movaps [esp + .vctot], xmm4
- movaps xmm0, xmm3
- movaps xmm4, xmm7
- mulps xmm3, xmm3
- mulps xmm7, xmm7
- subps xmm0, xmm1
- subps xmm4, xmm5
- mulps xmm0, xmm6
- mulps xmm4, xmm6
- mulps xmm0, xmm3 ; fscal
- mulps xmm7, xmm4 ; fscal
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do forces H2 - j water
- movaps xmm0, xmm7
- movaps xmm1, xmm7
- movaps xmm2, xmm7
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata: ; reduce the per-lane i forces and energies and add them to memory
- mov ecx, [esp + .ii3] ; ecx = ii3, used below as index into faction[]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3] ; edx = is3, used below as index into fshift[]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0 ; low half of xmm3-xmm5 = high half of xmm0-xmm2
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b ; move element 1 to pos 0
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2 ; xmm7(0) = z force
- movlhps xmm6, xmm1 ; x force in pos 0, y force in pos 2
- shufps xmm6, xmm6, 1000b ; xmm6(0,1) = x force, y force
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0 ; low half of xmm3-xmm5 = high half of xmm0-xmm2
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b ; move element 1 to pos 0
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2 ; z force
- movlhps xmm0, xmm1 ; x force in pos 0, y force in pos 2
- shufps xmm0, xmm0, 1000b ; xmm0(0,1) = x force, y force
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0 ; low half of xmm3-xmm5 = high half of xmm0-xmm2
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum is now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b ; move element 1 to pos 0
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2 ; z force
- movlhps xmm0, xmm1 ; x force in pos 0, y force in pos 2
- shufps xmm0, xmm0, 1000b ; xmm0(0,1) = x force, y force
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4] ; x and y of the fshift entry
- movss xmm4, [esi + edx*4 + 8] ; z of the fshift entry
- addps xmm3, xmm6 ; only the low two elements are stored back below
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b ; move element 1 to pos 0
- addss xmm7, xmm6 ; xmm7(0) = sum of all four vctot elements
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b ; move element 1 to pos 0
- addss xmm7, xmm6 ; xmm7(0) = sum of all four vnbtot elements
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end ; jump when the decremented count is zero
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx ; store the decremented count back
- jmp .outer
-.end: ; procedure epilogue
- emms
- mov eax, [esp + .salign] ; NOTE(review): presumably the alignment adjustment saved by the prologue (not visible here) - confirm
- add esp, eax ; undo that adjustment
- add esp, 1540 ; release the local stack space; NOTE(review): must equal the prologue's "sub esp" amount - confirm
- pop edi ; restore registers
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl2020_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$krf arg
-%$crf arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.vctot equ 352
-.fixO equ 384
-.fiyO equ 400
-.fizO equ 416
-.fixH1 equ 432
-.fiyH1 equ 448
-.fizH1 equ 464
-.fixH2 equ 480
-.fiyH2 equ 496
-.fizH2 equ 512
-.fjx equ 528
-.fjy equ 544
-.fjz equ 560
-.half equ 576
-.three equ 592
-.two equ 608
-.krf equ 624
-.crf equ 640
-.krsqO equ 656
-.krsqH1 equ 672
-.krsqH2 equ 688
-.is3 equ 704
-.ii3 equ 708
-.innerjjnr equ 712
-.innerk equ 716
-.salign equ 720
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 724 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b
- shufps xmm6, xmm6, 0b
- movaps [esp + .krf], xmm5
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- movaps xmm0, xmm5
- movaps xmm1, xmm6
- movaps xmm2, xmm7
-
- mulps xmm0, [esp + .krf]
- mulps xmm1, [esp + .krf]
- mulps xmm2, [esp + .krf]
-
- movaps [esp + .krsqH2], xmm0
- movaps [esp + .krsqH1], xmm1
- movaps [esp + .krsqO], xmm2
-
- ; start with rsqO - seed in xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm7, xmm4 ; rinvO in xmm7
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm6, xmm4 ; rinvH1 in xmm6
- ; rsqH2 - seed in xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps xmm5, xmm4 ; rinvH2 in xmm5
-
- ; do O interactions
- movaps xmm4, xmm7
- mulps xmm4, xmm4 ; xmm7=rinv, xmm4=rinvsq
-
- movaps xmm0, xmm7
- movaps xmm1, [esp + .krsqO]
- addps xmm0, xmm1
- subps xmm0, [esp + .crf] ; xmm0=rinv+krsq-crf
- mulps xmm1, [esp + .two]
- subps xmm7, xmm1
- mulps xmm0, [esp + .qqO]
- mulps xmm7, [esp + .qqO]
-
- mulps xmm4, xmm7 ; total fsO in xmm4
-
- addps xmm0, [esp + .vctot]
- movaps [esp + .vctot], xmm0
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H1 interactions
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm6=rinv, xmm4=rinvsq
- movaps xmm7, xmm6
- movaps xmm0, [esp + .krsqH1]
- addps xmm6, xmm0 ; xmm6=rinv+krsq
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
- mulps xmm0, [esp + .two]
- subps xmm7, xmm0 ; xmm7=rinv-2*krsq
- mulps xmm6, [esp + .qqH] ; vcoul
- mulps xmm7, [esp + .qqH]
- mulps xmm4, xmm7 ; total fsH1 in xmm4
-
- addps xmm6, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- movaps [esp + .vctot], xmm6
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; H2 interactions
- movaps xmm4, xmm5
- mulps xmm4, xmm4 ; xmm5=rinv, xmm4=rinvsq
- movaps xmm7, xmm5
- movaps xmm0, [esp + .krsqH2]
- addps xmm5, xmm0 ; xmm5=rinv+krsq
- subps xmm5, [esp + .crf] ; xmm5=rinv+krsq-crf
- mulps xmm0, [esp + .two] ; xmm0=2*krsq
- subps xmm7, xmm0 ; xmm7=rinv-2*krsq
- mulps xmm5, [esp + .qqH] ; vcoul
- mulps xmm7, [esp + .qqH] ; coul part of fscal
- mulps xmm4, xmm7 ; total fsH2 in xmm4
-
- addps xmm5, [esp + .vctot]
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- movaps [esp + .vctot], xmm5 ; store the updated vctot
- mulps xmm0, xmm4 ; xmm0-xmm2 = fsH2*(dx,dy,dz)
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner: ; reached when subtracting 4 from innerk went negative
- add [esp + .innerk], dword 4 ; undo that subtraction: innerk = number of j entries still left
- jnz .odd_loop ; leftovers exist: process them one per iteration
- jmp .updateouterdata ; nothing left in this neighbour list
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- movaps xmm0, xmm4
- mulps xmm0, [esp + .krf]
- movaps [esp + .krsqO], xmm0
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- movaps xmm4, xmm0
- mulps xmm4, xmm4 ; xmm4=rinvsq
-
- movaps xmm1, xmm0 ; xmm1=rinv
- movaps xmm3, [esp + .krsqO]
- addps xmm0, xmm3 ; xmm0=rinv+krsq
- subps xmm0, [esp + .crf] ; xmm0=rinv+krsq-crf
- mulps xmm3, [esp + .two]
- subps xmm1, xmm3 ; xmm1=rinv-2*krsq
- mulps xmm0, [esp + .qqO] ; xmm0=vcoul
- mulps xmm1, [esp + .qqO] ; xmm1=coul part of fs
-
-
- mulps xmm4, xmm1 ; xmm4=total fscal
- addps xmm0, [esp + .vctot]
- movaps [esp + .vctot], xmm0
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 724
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl2030_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$krf arg
-%$crf arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.vctot equ 768
-.fixO equ 784
-.fiyO equ 800
-.fizO equ 816
-.fixH1 equ 832
-.fiyH1 equ 848
-.fizH1 equ 864
-.fixH2 equ 880
-.fiyH2 equ 896
-.fizH2 equ 912
-.fjxO equ 928
-.fjyO equ 944
-.fjzO equ 960
-.fjxH1 equ 976
-.fjyH1 equ 992
-.fjzH1 equ 1008
-.fjxH2 equ 1024
-.fjyH2 equ 1040
-.fjzH2 equ 1056
-.half equ 1072
-.three equ 1088
-.rsqOO equ 1104
-.rsqOH1 equ 1120
-.rsqOH2 equ 1136
-.rsqH1O equ 1152
-.rsqH1H1 equ 1168
-.rsqH1H2 equ 1184
-.rsqH2O equ 1200
-.rsqH2H1 equ 1216
-.rsqH2H2 equ 1232
-.rinvOO equ 1248
-.rinvOH1 equ 1264
-.rinvOH2 equ 1280
-.rinvH1O equ 1296
-.rinvH1H1 equ 1312
-.rinvH1H2 equ 1328
-.rinvH2O equ 1344
-.rinvH2H1 equ 1360
-.rinvH2H2 equ 1376
-.two equ 1392
-.krf equ 1408
-.crf equ 1424
-.is3 equ 1440
-.ii3 equ 1444
-.innerjjnr equ 1448
-.innerk equ 1452
-.salign equ 1456
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1460 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_three]
- movups xmm4, [sse_two]
- movss xmm5, [ebp + %$krf]
- movss xmm6, [ebp + %$crf]
-
- movaps [esp + .half], xmm0
- movaps [esp + .three], xmm1
- movaps [esp + .two], xmm4
- shufps xmm5, xmm5, 0b
- shufps xmm6, xmm6, 0b
- movaps [esp + .krf], xmm5
- movaps [esp + .crf], xmm6
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- mulps xmm0, xmm0 ; xmm0=rinvsq
-
- mulps xmm5, [esp + .rsqOO] ; xmm5=krsq
- movaps xmm6, xmm5
- addps xmm6, xmm7 ; xmm6=rinv+krsq
- subps xmm6, [esp + .crf]
- mulps xmm6, [esp + .qqOO] ; xmm6=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOO] ; xmm7 = coul part of fscal
-
- addps xmm6, [esp + .vctot] ; local vctot summation variable
- mulps xmm0, xmm7
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqOH1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH1
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqOH2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1O] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1H1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH1H2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH2O] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqOH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH2H1] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm7, xmm0 ; xmm7=rinv
- movaps xmm5, [esp + .krf]
- movaps xmm1, xmm0
- mulps xmm5, [esp + .rsqH2H2] ; xmm5=krsq
- movaps xmm4, xmm5
- addps xmm4, xmm7 ; xmm4=rinv+krsq
- subps xmm4, [esp + .crf]
- mulps xmm0, xmm0
- mulps xmm4, [esp + .qqHH] ; xmm4=voul=qq*(rinv+krsq-crf)
- mulps xmm5, [esp + .two]
- subps xmm7, xmm5 ; xmm7=rinv-2*krsq
- mulps xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
- addps xmm6, xmm4 ; add to local vctot.
- mulps xmm0, xmm7 ; fsOH2
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- movaps xmm1, xmm0
- movaps [esp + .vctot], xmm6
- movaps xmm2, xmm0
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- movaps xmm6, xmm0
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- mulps xmm6, [esp + .krf] ; xmm6=krsq
- movaps xmm2, xmm1
- movaps xmm7, xmm6 ; xmm7=krsq
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
-
-
- addps xmm6, xmm3 ; xmm6=rinv+krsq
- mulps xmm7, [esp + .two]
- subps xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-
- xorps xmm1, xmm1
- movaps xmm0, xmm3
- subps xmm3, xmm7 ; xmm3=rinv-2*krsq
- xorps xmm4, xmm4
- mulps xmm0, xmm0 ; xmm0=rinvsq
- ;; fetch charges to xmm4 (temporary)
- movss xmm4, [esp + .qqOO]
- movhps xmm4, [esp + .qqOH]
-
- mulps xmm6, xmm4 ; vcoul
- mulps xmm3, xmm4 ; coul part of fs
-
-
- addps xmm6, [esp + .vctot]
- mulps xmm0, xmm3 ; total fscal
- movaps [esp + .vctot], xmm6
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
-
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- mulps xmm0, [esp + .krf] ; krsq
- mulps xmm4, [esp + .krf] ; krsq
-
- ;; assemble charges in xmm6
- xorps xmm6, xmm6
- movss xmm6, [esp + .qqOH]
- movhps xmm6, [esp + .qqHH]
- movaps xmm1, xmm0
- movaps xmm5, xmm4
- addps xmm0, xmm3 ; krsq+rinv
- addps xmm4, xmm7 ; krsq+rinv
- subps xmm0, [esp + .crf]
- subps xmm4, [esp + .crf]
- mulps xmm1, [esp + .two]
- mulps xmm5, [esp + .two]
- mulps xmm0, xmm6 ; vcoul
- mulps xmm4, xmm6 ; vcoul
- addps xmm4, xmm0
- addps xmm4, [esp + .vctot]
- movaps [esp + .vctot], xmm4
- movaps xmm0, xmm3
- movaps xmm4, xmm7
- mulps xmm3, xmm3
- mulps xmm7, xmm7
- subps xmm0, xmm1
- subps xmm4, xmm5
- mulps xmm0, xmm6
- mulps xmm4, xmm6
- mulps xmm0, xmm3 ; fscal
- mulps xmm7, xmm4 ; fscal
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do forces H2 - j water
- movaps xmm0, xmm7
- movaps xmm1, xmm7
- movaps xmm2, xmm7
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1460
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-;; inl3000_sse: SSE inner loop for tabulated Coulomb interactions.
-;; For each outer (i) particle the code walks that particle's neighbour
-;; list four, two or one j particles at a time, looks up the Coulomb
-;; potential and force in VFtab, and accumulates forces, shift forces and
-;; the potential energy.
-;;
-;; Arguments (read through [ebp + %$name]):
-;;   nri       number of i particles; counted down once per outer iteration
-;;   iinr      pointer to i particle indices, advanced one int per iteration
-;;   jindex    pointer to list bounds; jindex[n] and jindex[n+1] delimit jjnr
-;;   jjnr      base of the j particle index array
-;;   shift     pointer to shift indices, advanced one int per iteration
-;;   shiftvec  base of the shift vectors (three floats per shift index)
-;;   fshift    base of the shift forces (three floats per shift index)
-;;   gid       pointer to group indices, advanced one int per iteration
-;;   pos       base of the coordinates (three floats per particle)
-;;   faction   base of the forces (three floats per particle)
-;;   charge    base of the per-particle charges
-;;   facel     float multiplied into the i particle charge
-;;   Vc        base of the per-group energies that vctot is added to
-;;   tabscale  float that scales the distance r into a table position
-;;   VFtab     base of the table (four floats are read per table point)
-proc inl3000_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; esp is rounded down to a 16-byte boundary in the prologue so that movaps can be used on the 16-byte slots
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.tabscale equ 128
-.qq equ 144
-.fs equ 160 ; not referenced in this procedure
-.vctot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.innerjjnr equ 280
-.innerk equ 284
-.salign equ 288
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 292 ; local stack space
- mov eax, esp
- and eax, 0xf ; eax = misalignment of esp relative to 16 bytes
- sub esp, eax ; round esp down to a 16-byte boundary
- mov [esp + .salign], eax ; remember the adjustment so .end can undo it
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b ; broadcast tabscale to all four elements
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel] ; xmm3 = facel*charge[ii]
- shufps xmm3, xmm3, 0b ; broadcast to all four elements
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4] ; shift vector + i coordinate
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; innerk = number of innerloop atoms - 4
- jge .unroll_loop ; flags are still those of the sub (mov leaves them alone)
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b ; xmm3 = q1 q1 q2 q2
- shufps xmm4, xmm7, 00000000b ; xmm4 = q3 q3 q4 q4
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- mulps xmm3, xmm2 ; multiply by the broadcast i charge
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4] ; xmm4 = x1 y1 x2 y2
- movhps xmm5, [esi + edx*4] ; xmm5 = x3 y3 x4 y4
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b ; xmm2 = z1 z2 z3 z4
-
- shufps xmm0, xmm5, 10001000b ; xmm0 = x1 x2 x3 x4
- shufps xmm1, xmm5, 11011101b ; xmm1 = y1 y2 y3 y4
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- ; refine the seed lu: rinv = half * lu * (three - rsq*lu*lu)
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale] ; xmm4 = r*tabscale
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5 ; xmm6 = the four truncated indices as floats
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2 ; index*4: four floats are read per table point
- pslld mm7, 2
-
- ; park the j3 indices in mm0-mm3 while eax-edx hold table offsets
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b ; xmm4 = first float of each table point
- shufps xmm5, xmm7, 11011101b ; xmm5 = second float of each table point
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b ; xmm6 = third float of each table point
- shufps xmm7, xmm3, 11011101b ; xmm7 = fourth float of each table point
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3 ; xmm4 = -fijC*tabscale*rinv
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- ; restore the j3 indices parked in mm0-mm3
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx and fjy (fjz is handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b ; bring element 1 (j2) to position 0
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b ; bring element 2 (j3) to position 0
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b ; bring element 3 (j4) to position 0
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4 ; undo the bias of 4: innerk = j particles left (0-3)
- mov edx, [esp + .innerk]
- and edx, 10b ; bit 1 set means two or three are left
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7 ; zero the two upper elements of qq
- movaps [esp + .qq], xmm3
-
- mov edi, [ebp + %$pos]
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7 ; same clearing as above; xmm3 is not changed by it
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- ; the shuffles below repeat the pair in the two upper elements
- shufps xmm2, xmm2, 10001000b ; xmm2 = z1 z2 z1 z2
-
- shufps xmm0, xmm0, 10001000b ; xmm0 = x1 x2 x1 x2
- shufps xmm1, xmm1, 11011101b ; xmm1 = y1 y2 y1 y2
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- ; refine the seed lu: rinv = half * lu * (three - rsq*lu*lu)
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale] ; xmm4 = r*tabscale
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2 ; index*4: four floats are read per table point
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b ; xmm7 is zero here, so the two upper elements become 0
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3 ; xmm4 = -fijC*tabscale*rinv
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- ; swap elements 0 and 1 so the second j particle's force is in position 0
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b ; bit 0 set means one more j particle is left
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- ; refine the seed lu: rinv = half * lu * (three - rsq*lu*lu)
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale] ; xmm4 = r*tabscale
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2 ; index*4: four floats are read per table point
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b ; second float of the table point to position 0
- shufps xmm7, xmm7, 1b ; fourth float of the table point to position 0
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3 ; xmm4 = -fijC*tabscale*rinv
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the partial sums are now in the two low elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax ; undo the alignment adjustment made in the prologue
- add esp, 292 ; release the local stack space
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3010_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.tabscale equ 128
-.qq equ 144
-.fs equ 160
-.vctot equ 176
-.fix equ 192
-.fiy equ 208
-.fiz equ 224
-.half equ 240
-.three equ 256
-.is3 equ 272
-.ii3 equ 276
-.shX equ 280
-.shY equ 284
-.shZ equ 288
-.ntia equ 292
-.innerjjnr0 equ 296
-.innerk0 equ 300
-.innerjjnr equ 304
-.innerk equ 308
-.salign equ 312
-.nscoul equ 316
-.solnr equ 320
-
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 324 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- add [ebp + %$nsatoms], dword 8
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
- movss [esp + .shX], xmm0
- movss [esp + .shY], xmm1
- movss [esp + .shZ], xmm2
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- mov ecx, [eax]
- add [ebp + %$nsatoms], dword 12
- mov [esp + .nscoul], ecx
-
- ; clear vctot
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- mov ecx, [esp + .nscoul]
- cmp ecx, dword 0
- jnz .mno_coul
- jmp .last_mno
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-
-.unroll_coul_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- mulps xmm3, xmm2
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx, fjy (the z forces are handled separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_coul
- jmp .checksingle_coul
-.dopair_coul:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov edi, [ebp + %$pos]
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_coul
- jmp .updateouterdata_coul
-.dosingle_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; the sum now lies in the two low elements of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .last_mno
- jmp .mno_coul
-
-.last_mno:
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 324
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3020_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.rinvO equ 352
-.rinvH1 equ 368
-.rinvH2 equ 384
-.rO equ 400
-.rH1 equ 416
-.rH2 equ 432
-.tabscale equ 448
-.two equ 464
-.vctot equ 480
-.fixO equ 496
-.fiyO equ 512
-.fizO equ 528
-.fixH1 equ 544
-.fiyH1 equ 560
-.fizH1 equ 576
-.fixH2 equ 592
-.fiyH2 equ 608
-.fizH2 equ 624
-.fjx equ 640
-.fjy equ 656
-.fjz equ 672
-.half equ 688
-.three equ 704
-.is3 equ 720
-.ii3 equ 724
-.innerjjnr equ 728
-.innerk equ 732
-.salign equ 736
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 740 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp +%$tabscale]
-
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- ; start with rsqO - seed to xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvO], xmm4 ; rinvO in xmm4
- mulps xmm7, xmm4
- movaps [esp + .rO], xmm7
-
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH1], xmm4 ; rinvH1 in xmm4
- mulps xmm6, xmm4
- movaps [esp + .rH1], xmm6
-
- ; rsqH2 - seed to xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH2], xmm4 ; rinvH2 in xmm4
- mulps xmm5, xmm4
- movaps [esp + .rH2], xmm5
-
- ; do O interactions
- ;; rO is still in xmm7.
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
-
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
-
- ; at this point xmm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm0, [esp + .tabscale]
- mulps xmm0, [esp + .rinvO]
- subps xmm4, xmm0
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; tx in xmm0-xmm2
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ;; Done with O interactions - now H1!
- movaps xmm7, [esp + .rH1]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm7 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH1]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; Done with H1, finally we do H2 interactions
- movaps xmm7, [esp + .rH2]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm7 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH2]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack them back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- movaps [esp + .rinvO], xmm0
-
- mulps xmm4, [esp + .tabscale]
- movhlps xmm7, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm7 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm7, mm7
- movlhps xmm3, xmm7
-
- subps xmm4, xmm3
- movaps xmm1, xmm4 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ecx
- movd mm2, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
- mulps xmm0, [esp + .tabscale]
- mulps xmm0, [esp + .rinvO]
- subps xmm4, xmm0
-
- movd eax, mm0
- movd ecx, mm1
- movd edx, mm2
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 740
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3030_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.two equ 768
-.tabscale equ 784
-.vctot equ 800
-.fixO equ 816
-.fiyO equ 832
-.fizO equ 848
-.fixH1 equ 864
-.fiyH1 equ 880
-.fizH1 equ 896
-.fixH2 equ 912
-.fiyH2 equ 928
-.fizH2 equ 944
-.fjxO equ 960
-.fjyO equ 976
-.fjzO equ 992
-.fjxH1 equ 1008
-.fjyH1 equ 1024
-.fjzH1 equ 1040
-.fjxH2 equ 1056
-.fjyH2 equ 1072
-.fjzH2 equ 1088
-.half equ 1104
-.three equ 1120
-.rsqOO equ 1136
-.rsqOH1 equ 1152
-.rsqOH2 equ 1168
-.rsqH1O equ 1184
-.rsqH1H1 equ 1200
-.rsqH1H2 equ 1216
-.rsqH2O equ 1232
-.rsqH2H1 equ 1248
-.rsqH2H2 equ 1264
-.rinvOO equ 1280
-.rinvOH1 equ 1296
-.rinvOH2 equ 1312
-.rinvH1O equ 1328
-.rinvH1H1 equ 1344
-.rinvH1H2 equ 1360
-.rinvH2O equ 1376
-.rinvH2H1 equ 1392
-.rinvH2H2 equ 1408
-.is3 equ 1424
-.ii3 equ 1428
-.innerjjnr equ 1432
-.innerk equ 1436
-.salign equ 1440
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1444 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp +%$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOO] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- xorps xmm2, xmm2
- movaps [esp + .vctot], xmm5
- mulps xmm3, [esp + .tabscale]
-
- subps xmm2, xmm3
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- mov esi, [ebp + %$VFtab]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOO]
- movhps xmm3, [esp + .qqOH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm2, xmm2
- mulps xmm3, [esp + .tabscale]
-
- subps xmm2, xmm3
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; start with H1, save H2 data
- movaps [esp + .rsqH2O], xmm4
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- ;; start with H1, save H2 data
- movaps [esp + .rinvH2O], xmm7
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do table for H2 - j water interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, [esp + .rsqH2O]
- mulps xmm1, xmm0 ; xmm0=rinv, xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; pos 0-1 in xmm0-xmm2 have the sum now
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; pos 0-1 in xmm0-xmm2 have the sum now
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; pos 0-1 in xmm0-xmm2 have the sum now
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1444
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3100_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is 16-byte aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.six equ 128
-.twelve equ 144
-.tabscale equ 160
-.qq equ 176
-.c6 equ 192
-.c12 equ 208
-.fs equ 224
-.vctot equ 240
-.vnbtot equ 256
-.fix equ 272
-.fiy equ 288
-.fiz equ 304
-.half equ 320
-.three equ 336
-.is3 equ 352
-.ii3 equ 356
-.ntia equ 360
-.innerjjnr equ 364
-.innerk equ 368
-.salign equ 372
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 376 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movups xmm3, [sse_six]
- movups xmm4, [sse_twelve]
- movss xmm5, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- movaps [esp + .six], xmm3
- movaps [esp + .twelve], xmm4
- shufps xmm5, xmm5, 0b
- movaps [esp + .tabscale], xmm5
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot, vnbtot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- mulps xmm3, xmm2
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm4 contain fjx, fjy (the z forces are updated separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; add vcoul to vctot right away - then we can reuse xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; add vcoul to vctot right away - then we can reuse xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; pos 0-1 in xmm0-xmm2 have the sum now
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 376
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3110_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is aligned to a 16-byte boundary for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.tabscale equ 128
-.qq equ 144
-.c6 equ 160
-.c12 equ 176
-.six equ 192
-.twelve equ 208
-.fs equ 224
-.vctot equ 240
-.vnbtot equ 256
-.fix equ 272
-.fiy equ 288
-.fiz equ 304
-.half equ 320
-.three equ 336
-.is3 equ 352
-.ii3 equ 356
-.shX equ 360
-.shY equ 364
-.shZ equ 368
-.ntia equ 372
-.innerjjnr0 equ 376
-.innerk0 equ 380
-.innerjjnr equ 384
-.innerk equ 388
-.salign equ 392
-.nsvdwc equ 396
-.nscoul equ 400
-.nsvdw equ 404
-.solnr equ 408
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 412 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movups xmm3, [sse_six]
- movups xmm4, [sse_twelve]
- movss xmm5, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- movaps [esp + .six], xmm3
- movaps [esp + .twelve], xmm4
- shufps xmm5, xmm5, 0b
- movaps [esp + .tabscale], xmm5
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movlps xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 8]
- movlps [esp + .shX], xmm0
- movss [esp + .shZ], xmm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- mulps xmm3, xmm2
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; add vcoul to vctot right away - then we can reuse xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx, fjy (fjz is loaded and updated separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; add vcoul to vctot right away - then we can reuse xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- mov edi, [ebp + %$faction]
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; L-J
- movaps xmm4, xmm0
- mulps xmm4, xmm0 ; xmm4=rinvsq
-
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; add vcoul to vctot right away - then we can reuse xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
-
- movaps xmm6, xmm4
- mulps xmm6, xmm4
-
- movaps [esp + .vctot], xmm5
-
- mulps xmm6, xmm4 ; xmm6=rinvsix
- movaps xmm4, xmm6
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm6, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm7, [esp + .vnbtot]
- addps xmm7, xmm4
- mulps xmm4, [esp + .twelve]
- subps xmm7, xmm6
- mulps xmm3, [esp + .tabscale]
- mulps xmm6, [esp + .six]
- movaps [esp + .vnbtot], xmm7
- subps xmm4, xmm6
- mulps xmm4, xmm0
- subps xmm4, xmm3
- mulps xmm4, xmm0
-
- mov edi, [ebp +%$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; pos 0-1 in xmm0-xmm2 have the sum now
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
-.testcoul:
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-
-.unroll_coul_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- mulps xmm3, xmm2
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_coul
- jmp .checksingle_coul
-.dopair_coul:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov edi, [ebp + %$pos]
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_coul
- jmp .updateouterdata_coul
-.dosingle_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
-.testvdw:
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rcpps xmm5, xmm4
- ; 1/x lookup seed in xmm5
- movaps xmm0, [esp + .two]
- mulps xmm4, xmm5
- subps xmm0, xmm4
- mulps xmm0, xmm5 ; xmm0=rinvsq
- movaps xmm4, xmm0
-
- movaps xmm1, xmm0
- mulps xmm1, xmm0
- mulps xmm1, xmm0 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
-
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm5, xmm2
- subps xmm5, xmm1 ; vnb=vnb12-vnb6
- addps xmm5, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- subps xmm2, xmm1
- mulps xmm4, xmm2 ; xmm4=total fscal
-
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movaps [esp + .vnbtot], xmm5
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- mov edi, [ebp +%$faction]
-
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-.last_mno:
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 412
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3120_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.rinvO equ 352
-.rinvH1 equ 368
-.rinvH2 equ 384
-.rO equ 400
-.rH1 equ 416
-.rH2 equ 432
-.tabscale equ 448
-.two equ 464
-.c6 equ 480
-.c12 equ 496
-.six equ 512
-.twelve equ 528
-.vctot equ 544
-.vnbtot equ 560
-.fixO equ 576
-.fiyO equ 592
-.fizO equ 608
-.fixH1 equ 624
-.fiyH1 equ 640
-.fizH1 equ 656
-.fixH2 equ 672
-.fiyH2 equ 688
-.fizH2 equ 704
-.fjx equ 720
-.fjy equ 736
-.fjz equ 752
-.half equ 768
-.three equ 784
-.is3 equ 800
-.ii3 equ 804
-.ntia equ 808
-.innerjjnr equ 812
-.innerk equ 816
-.salign equ 820
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 824 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movups xmm3, [sse_six]
- movups xmm4, [sse_twelve]
- movss xmm5, [ebp +%$tabscale]
-
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- movaps [esp + .six], xmm3
- movaps [esp + .twelve], xmm4
- shufps xmm5, xmm5, 0b
- movaps [esp + .tabscale], xmm5
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- ; start with rsqO - seed to xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvO], xmm4 ; rinvO in xmm4
- mulps xmm7, xmm4
- movaps [esp + .rO], xmm7
-
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH1], xmm4 ; rinvH1 in xmm4
- mulps xmm6, xmm4
- movaps [esp + .rH1], xmm6
-
- ; rsqH2 - seed to xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH2], xmm4 ; rinvH2 in xmm4
- mulps xmm5, xmm4
- movaps [esp + .rH2], xmm5
-
- ; do O interactions
- ;; rO is still in xmm7.
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
-
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
-
- ; do nontable L-J
- movaps xmm2, [esp + .rinvO]
- mulps xmm2, xmm2
-
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of mm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- movaps xmm1, xmm2
- mulps xmm1, xmm1
- mulps xmm1, xmm2 ; xmm1=rinvsix
- movaps xmm4, xmm1
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm3, xmm4
- subps xmm3, xmm1 ; xmm3=vnb12-vnb6
- mulps xmm1, [esp + .six]
- mulps xmm4, [esp + .twelve]
- subps xmm4, xmm1
- addps xmm3, [esp + .vnbtot]
- mulps xmm4, [esp + .rinvO]
- mulps xmm0, [esp + .tabscale]
- subps xmm4, xmm0
- movaps [esp + .vnbtot], xmm3
- mulps xmm4, [esp + .rinvO]
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; tx in xmm0-xmm2
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ;; Done with O interactions - now H1!
- movaps xmm7, [esp + .rH1]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm7 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH1]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; Done with H1, finally we do H2 interactions
- movaps xmm7, [esp + .rH2]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH2]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- xorps xmm6, xmm6
- mov esi, [ebp + %$type]
- mov ebx, [esi + eax*4]
- mov esi, [ebp + %$nbfp]
- shl ebx, 1
- add ebx, [esp + .ntia]
- movlps xmm6, [esi + ebx*4]
- movaps xmm7, xmm6
- shufps xmm6, xmm6, 11111100b
- shufps xmm7, xmm7, 11111101b
- movaps [esp + .c6], xmm6
- movaps [esp + .c12], xmm7
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- movaps [esp + .rinvO], xmm0
-
- mulps xmm4, [esp + .tabscale]
- movhlps xmm7, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm7 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm7, mm7
- movlhps xmm3, xmm7
-
- subps xmm4, xmm3
- movaps xmm1, xmm4 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ecx
- movd mm2, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of mm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; do nontable L-J
- movaps xmm2, [esp + .rinvO]
- mulps xmm2, xmm2
- movaps xmm1, xmm2
- mulps xmm1, xmm1
- mulps xmm1, xmm2 ; xmm1=rinvsix
- movaps xmm4, xmm1
- mulps xmm4, xmm4 ; xmm4=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm4, [esp + .c12]
- movaps xmm3, xmm4
- subps xmm3, xmm1 ; xmm3=vnb12-vnb6
- mulps xmm1, [esp + .six]
- mulps xmm4, [esp + .twelve]
- subps xmm4, xmm1
- addps xmm3, [esp + .vnbtot]
- mulps xmm4, [esp + .rinvO]
- mulps xmm0, [esp + .tabscale]
- subps xmm4, xmm0
- movaps [esp + .vnbtot], xmm3
- mulps xmm4, [esp + .rinvO]
-
- movd eax, mm0
- movd ecx, mm1
- movd edx, mm2
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 824
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3130_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.two equ 768
-.tabscale equ 784
-.c6 equ 800
-.c12 equ 816
-.six equ 832
-.twelve equ 848
-.vctot equ 864
-.vnbtot equ 880
-.fixO equ 896
-.fiyO equ 912
-.fizO equ 928
-.fixH1 equ 944
-.fiyH1 equ 960
-.fizH1 equ 976
-.fixH2 equ 992
-.fiyH2 equ 1008
-.fizH2 equ 1024
-.fjxO equ 1040
-.fjyO equ 1056
-.fjzO equ 1072
-.fjxH1 equ 1088
-.fjyH1 equ 1104
-.fjzH1 equ 1120
-.fjxH2 equ 1136
-.fjyH2 equ 1152
-.fjzH2 equ 1168
-.half equ 1184
-.three equ 1200
-.rsqOO equ 1216
-.rsqOH1 equ 1232
-.rsqOH2 equ 1248
-.rsqH1O equ 1264
-.rsqH1H1 equ 1280
-.rsqH1H2 equ 1296
-.rsqH2O equ 1312
-.rsqH2H1 equ 1328
-.rsqH2H2 equ 1344
-.rinvOO equ 1360
-.rinvOH1 equ 1376
-.rinvOH2 equ 1392
-.rinvH1O equ 1408
-.rinvH1H1 equ 1424
-.rinvH1H2 equ 1440
-.rinvH2O equ 1456
-.rinvH2H1 equ 1472
-.rinvH2H2 equ 1488
-.fstmp equ 1504
-.is3 equ 1520
-.ii3 equ 1524
-.innerjjnr equ 1528
-.innerk equ 1532
-.salign equ 1536
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1540 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movups xmm3, [sse_six]
- movups xmm4, [sse_twelve]
- movss xmm5, [ebp +%$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- movaps [esp + .six], xmm3
- movaps [esp + .twelve], xmm4
- shufps xmm5, xmm5, 0b
- movaps [esp + .tabscale], xmm5
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
- xorps xmm0, xmm0
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- add edx, ecx
- mov eax, [ebp + %$nbfp]
- movlps xmm0, [eax + edx*4]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 01010101b
- movaps [esp + .c6], xmm0
- movaps [esp + .c12], xmm1
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOO] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- mulps xmm3, [esp + .tabscale]
-
- ;; start doing lj
- movaps xmm2, xmm0
- mulps xmm2, xmm2
- movaps xmm1, xmm2
- mulps xmm1, xmm2
- mulps xmm1, xmm2 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=rinvtwelve
- mulps xmm1, [esp + .c6]
- mulps xmm2, [esp + .c12]
- movaps xmm4, xmm2
- subps xmm4, xmm1
- addps xmm4, [esp + .vnbtot]
- mulps xmm1, [esp + .six]
- mulps xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm4
- subps xmm2, xmm1
- mulps xmm2, xmm0
-
- subps xmm2, xmm3
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- mov esi, [ebp + %$VFtab]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOO]
- movhps xmm3, [esp + .qqOH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- mulps xmm3, [esp + .tabscale]
-
- ;; start doing lj
- xorps xmm2, xmm2
- movss xmm2, xmm0
- mulss xmm2, xmm2
- movaps xmm1, xmm2
- mulss xmm1, xmm2
- mulss xmm1, xmm2 ; xmm1=rinvsix
- movaps xmm2, xmm1
- mulss xmm2, xmm2 ; xmm2=rinvtwelve
- mulss xmm1, [esp + .c6]
- mulss xmm2, [esp + .c12]
- movaps xmm4, xmm2
- subss xmm4, xmm1
- addps xmm4, [esp + .vnbtot]
- mulss xmm1, [esp + .six]
- mulss xmm2, [esp + .twelve]
- movaps [esp + .vnbtot], xmm4
- subss xmm2, xmm1
- mulss xmm2, xmm0
-
- subps xmm2, xmm3
- mulps xmm0, xmm2
-
- movaps xmm1, xmm0
- movaps xmm2, xmm0
-
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; start with H1, save H2 data
- movaps [esp + .rsqH2O], xmm4
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- ;; start with H1, save H2 data
- movaps [esp + .rinvH2O], xmm7
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do table for H2 - j water interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, [esp + .rsqH2O]
- mulps xmm1, xmm0 ; xmm0=rinv, xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; partial sums now in pos 0-1 of xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1540
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-proc inl3300_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is 16-byte aligned for sse (movaps) use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.tabscale equ 128
-.qq equ 144
-.c6 equ 160
-.c12 equ 176
-.fs equ 192
-.vctot equ 208
-.vnbtot equ 224
-.fix equ 240
-.fiy equ 256
-.fiz equ 272
-.half equ 288
-.three equ 304
-.is3 equ 320
-.ii3 equ 324
-.ntia equ 328
-.innerjjnr equ 332
-.innerk equ 336
-.salign equ 340
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 344 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear vctot, vnbtot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .finish_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- mulps xmm3, xmm2
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 32]
- movlps xmm7, [esi + ecx*4 + 32]
- movhps xmm5, [esi + ebx*4 + 32]
- movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 40]
- movlps xmm3, [esi + ecx*4 + 40]
- movhps xmm7, [esi + ebx*4 + 40]
- movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3/xmm4 contain fjx, fjy (fjz is updated separately below)
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_inner
- jmp .unroll_loop
-.finish_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair
- jmp .checksingle
-.dopair:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 32]
- movhps xmm5, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 40]
- movhps xmm7, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle
- jmp .updateouterdata
-.dosingle:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- lea ebx, [ebx + ebx*2]
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- ; increment vcoul - then we can get rid of xmm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 32]
- movlps xmm6, [esi + ebx*4 + 40]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 344
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-
-
-proc inl3310_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
-%$nsatoms arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ix equ 0
-.iy equ 16
-.iz equ 32
-.iq equ 48
-.dx equ 64
-.dy equ 80
-.dz equ 96
-.two equ 112
-.tabscale equ 128
-.qq equ 144
-.c6 equ 160
-.c12 equ 176
-.fs equ 192
-.vctot equ 208
-.vnbtot equ 224
-.fix equ 240
-.fiy equ 256
-.fiz equ 272
-.half equ 288
-.three equ 304
-.is3 equ 320
-.ii3 equ 324
-.shX equ 328
-.shY equ 332
-.shZ equ 336
-.ntia equ 340
-.innerjjnr0 equ 344
-.innerk0 equ 348
-.innerjjnr equ 352
-.innerk equ 356
-.salign equ 360
-.nsvdwc equ 364
-.nscoul equ 368
-.nsvdw equ 372
-.solnr equ 376
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 380 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp + %$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movlps xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 8]
- movlps [esp + .shX], xmm0
- movss [esp + .shZ], xmm1
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- mov eax, [ebp + %$nsatoms]
- add [ebp + %$nsatoms], dword 12
- mov ecx, [eax]
- mov edx, [eax + 4]
- mov eax, [eax + 8]
- sub ecx, eax
- sub eax, edx
-
- mov [esp + .nsvdwc], edx
- mov [esp + .nscoul], eax
- mov [esp + .nsvdw], ecx
-
- ;; clear potential
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- mov [esp + .solnr], ebx
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr0], eax ; pointer to jjnr[nj0]
- mov [esp + .innerk0], edx ; number of innerloop atoms
-
- mov ecx, [esp + .nsvdwc]
- cmp ecx, dword 0
- jnz .mno_vdwc
- jmp .testcoul
-.mno_vdwc:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdwc_loop
- jmp .finish_vdwc_inner
-.unroll_vdwc_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- mulps xmm3, xmm2
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 32]
- movlps xmm7, [esi + ecx*4 + 32]
- movhps xmm5, [esi + ebx*4 + 32]
- movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 40]
- movlps xmm3, [esi + ecx*4 + 40]
- movhps xmm7, [esi + ebx*4 + 40]
- movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdwc_inner
- jmp .unroll_vdwc_loop
-.finish_vdwc_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdwc
- jmp .checksingle_vdwc
-.dopair_vdwc:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 32]
- movhps xmm5, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 40]
- movhps xmm7, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdwc:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdwc
- jmp .updateouterdata_vdwc
-.dosingle_vdwc:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- lea ebx, [ebx + ebx*2]
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fs], xmm3
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fs] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 32]
- movlps xmm6, [esi + ebx*4 + 40]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdwc:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
-
- ;; loop back to mno.
- dec dword [esp + .nsvdwc]
- jz .testcoul
- jmp .mno_vdwc
-.testcoul:
- mov ecx, [esp + .nscoul]
- cmp ecx, byte 0
- jnz .mno_coul
- jmp .testvdw
-.mno_coul:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- mulss xmm3, [ebp + %$facel]
- shufps xmm3, xmm3, 0b
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- movaps [esp + .iq], xmm3
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov [esp + .ii3], ebx
-
- ; clear i forces
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_coul_loop
- jmp .finish_coul_inner
-
-.unroll_coul_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- movaps xmm2, [esp + .iq]
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- mulps xmm3, xmm2
-
- movaps [esp + .qq], xmm3
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_coul_inner
- jmp .unroll_coul_loop
-.finish_coul_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_coul
- jmp .checksingle_coul
-.dopair_coul:
- mov esi, [ebp + %$charge]
-
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
- movss xmm3, [esi + eax*4]
- movss xmm6, [esi + ebx*4]
- shufps xmm3, xmm6, 00000000b
- shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
- mulps xmm3, [esp + .iq]
- movlhps xmm3, xmm7
- movaps [esp + .qq], xmm3
-
- mov edi, [ebp + %$pos]
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ecx*4]
- movhps xmm5, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8]
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_coul
- jmp .updateouterdata_coul
-.dosingle_coul:
- mov esi, [ebp + %$charge]
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
- movss xmm6, [esi + eax*4] ; xmm6(0) has the charge
- mulps xmm6, [esp + .iq]
- movaps [esp + .qq], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- lea ebx, [ebx + ebx*2]
-
- movlps xmm4, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qq]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm4, xmm4
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm4, xmm3
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_coul:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nscoul]
- jz .testvdw
- jmp .mno_coul
-.testvdw:
- mov ecx, [esp + .nsvdw]
- cmp ecx, byte 0
- jnz .mno_vdw
- jmp .last_mno
-.mno_vdw:
- mov ebx, [esp + .solnr]
- inc dword [esp + .solnr]
-
- mov edx, [ebp + %$type]
- mov edx, [edx + ebx*4]
- imul edx, [ebp + %$ntype]
- shl edx, 1
- mov [esp + .ntia], edx
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movss xmm0, [esp + .shX]
- movss xmm1, [esp + .shY]
- movss xmm2, [esp + .shZ]
-
- addss xmm0, [eax + ebx*4]
- addss xmm1, [eax + ebx*4 + 4]
- addss xmm2, [eax + ebx*4 + 8]
-
- xorps xmm4, xmm4
- movaps [esp + .fix], xmm4
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm4
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movaps [esp + .ix], xmm0
- movaps [esp + .iy], xmm1
- movaps [esp + .iz], xmm2
-
- mov ecx, [esp + .innerjjnr0]
- mov [esp + .innerjjnr], ecx
- mov edx, [esp + .innerk0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_vdw_loop
- jmp .finish_vdw_inner
-.unroll_vdw_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
-
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
-
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ix-iz to xmm4-xmm6
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- movhlps xmm5, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm5 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm6, mm6
- cvtpi2ps xmm5, mm7
- movlhps xmm6, xmm5
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 0]
- movlps xmm7, [esi + ecx*4 + 0]
- movhps xmm5, [esi + ebx*4 + 0]
- movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- mov edi, [ebp + %$faction]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; the fj's - start by accumulating x & y forces from memory
- movlps xmm4, [edi + eax*4]
- movlps xmm6, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm6, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm6, 10001000b
- shufps xmm4, xmm6, 11011101b
-
- ; now xmm3-xmm5 contains fjx, fjy, fjz
- subps xmm3, xmm0
- subps xmm4, xmm1
-
- ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
- movaps xmm6, xmm3
- unpcklps xmm6, xmm4
- unpckhps xmm3, xmm4
- ; xmm6(l)=x & y for j1, (h) for j2
- ; xmm3(l)=x & y for j3, (h) for j4
- movlps [edi + eax*4], xmm6
- movlps [edi + ecx*4], xmm3
-
- movhps [edi + ebx*4], xmm6
- movhps [edi + edx*4], xmm3
-
- ;; and the z forces
- movss xmm4, [edi + eax*4 + 8]
- movss xmm5, [edi + ebx*4 + 8]
- movss xmm6, [edi + ecx*4 + 8]
- movss xmm7, [edi + edx*4 + 8]
- subss xmm4, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm5, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm6, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm7, xmm2
- movss [edi + eax*4 + 8], xmm4
- movss [edi + ebx*4 + 8], xmm5
- movss [edi + ecx*4 + 8], xmm6
- movss [edi + edx*4 + 8], xmm7
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .finish_vdw_inner
- jmp .unroll_vdw_loop
-.finish_vdw_inner:
- ;; check if at least two particles remain
- add [esp + .innerk], dword 4
- mov edx, [esp + .innerk]
- and edx, 10b
- jnz .dopair_vdw
- jmp .checksingle_vdw
-.dopair_vdw:
- mov ecx, [esp + .innerjjnr]
-
- mov eax, [ecx]
- mov ebx, [ecx + 4]
- add [esp + .innerjjnr], dword 8
- xorps xmm7, xmm7
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov edx, ebx
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add ecx, edi
- add edx, edi
- movlps xmm6, [esi + ecx*4]
- movhps xmm6, [esi + edx*4]
- mov edi, [ebp + %$pos]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 1000b
- shufps xmm6, xmm6, 1101b
- movlhps xmm4, xmm7
- movlhps xmm6, xmm7
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- ; move coordinates to xmm0-xmm2
- movlps xmm1, [edi + eax*4]
- movss xmm2, [edi + eax*4 + 8]
- movhps xmm1, [edi + ebx*4]
- movss xmm0, [edi + ebx*4 + 8]
-
- movlhps xmm3, xmm7
-
- shufps xmm2, xmm0, 0b
-
- movaps xmm0, xmm1
-
- shufps xmm2, xmm2, 10001000b
-
- shufps xmm0, xmm0, 10001000b
- shufps xmm1, xmm1, 11011101b
-
- mov edi, [ebp + %$faction]
- ; move ix-iz to xmm4-xmm6
- xorps xmm7, xmm7
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ecx, mm6
- psrlq mm6, 32
- movd edx, mm6
-
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- ; dispersion
- movlps xmm5, [esi + ecx*4 + 0]
- movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 8]
- movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + ecx*4 + 16]
- movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ecx*4 + 24]
- movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update the fj's
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-
- shufps xmm0, xmm0, 11100001b
- shufps xmm1, xmm1, 11100001b
- shufps xmm2, xmm2, 11100001b
-
- movss xmm3, [edi + ebx*4]
- movss xmm4, [edi + ebx*4 + 4]
- movss xmm5, [edi + ebx*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + ebx*4], xmm3
- movss [edi + ebx*4 + 4], xmm4
- movss [edi + ebx*4 + 8], xmm5
-
-.checksingle_vdw:
- mov edx, [esp + .innerk]
- and edx, 1b
- jnz .dosingle_vdw
- jmp .updateouterdata_vdw
-.dosingle_vdw:
- mov edi, [ebp + %$pos]
- mov ecx, [esp + .innerjjnr]
- mov eax, [ecx]
- xorps xmm6, xmm6
-
- mov esi, [ebp + %$type]
- mov ecx, eax
- mov ecx, [esi + ecx*4]
- mov esi, [ebp + %$nbfp]
- shl ecx, 1
- add ecx, [esp + .ntia]
- movlps xmm6, [esi + ecx*4]
- movaps xmm4, xmm6
- shufps xmm4, xmm4, 11111100b
- shufps xmm6, xmm6, 11111101b
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- lea eax, [eax + eax*2]
-
- ; move coordinates to xmm0-xmm2
- movss xmm0, [edi + eax*4]
- movss xmm1, [edi + eax*4 + 4]
- movss xmm2, [edi + eax*4 + 8]
-
- movaps xmm4, [esp + .ix]
- movaps xmm5, [esp + .iy]
- movaps xmm6, [esp + .iz]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dx], xmm4
- movaps [esp + .dy], xmm5
- movaps [esp + .dz], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- ; rsq in xmm4
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
-
- mulps xmm4, xmm0 ; xmm4=r
- mulps xmm4, [esp + .tabscale]
-
- cvttps2pi mm6, xmm4 ; mm6 contain lu indices
- cvtpi2ps xmm6, mm6
- subps xmm4, xmm6
- movaps xmm1, xmm4 ;xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
-
- pslld mm6, 2
-
- mov esi, [ebp + %$VFtab]
- movd ebx, mm6
-
- lea ebx, [ebx + ebx*2]
-
- ; dispersion
- movlps xmm4, [esi + ebx*4 + 0]
- movlps xmm6, [esi + ebx*4 + 8]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fs], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm4, [esi + ebx*4 + 16]
- movlps xmm6, [esi + ebx*4 + 24]
- movaps xmm5, xmm4
- movaps xmm7, xmm6
- shufps xmm5, xmm5, 1b
- shufps xmm7, xmm7, 1b
- ; table ready in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fs]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm4, xmm7
- mov edi, [ebp + %$faction]
-
- movaps xmm0, [esp + .dx]
- movaps xmm1, [esp + .dy]
- movaps xmm2, [esp + .dz]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- ; xmm0-xmm2 contains tx-tz (partial force)
- ; now update f_i
- movaps xmm3, [esp + .fix]
- movaps xmm4, [esp + .fiy]
- movaps xmm5, [esp + .fiz]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm5, xmm2
- movaps [esp + .fix], xmm3
- movaps [esp + .fiy], xmm4
- movaps [esp + .fiz], xmm5
- ; update fj
-
- movss xmm3, [edi + eax*4]
- movss xmm4, [edi + eax*4 + 4]
- movss xmm5, [edi + eax*4 + 8]
- subss xmm3, xmm0
- subss xmm4, xmm1
- subss xmm5, xmm2
- movss [edi + eax*4], xmm3
- movss [edi + eax*4 + 4], xmm4
- movss [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fix]
- movaps xmm1, [esp + .fiy]
- movaps xmm2, [esp + .fiz]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; increment fshift force
- movss xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 4]
- movss xmm5, [esi + edx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esi + edx*4], xmm3
- movss [esi + edx*4 + 4], xmm4
- movss [esi + edx*4 + 8], xmm5
-
- ;; loop back to mno.
- dec dword [esp + .nsvdw]
- jz .last_mno
- jmp .mno_vdw
-.last_mno:
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 380
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3320_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.iqO equ 144
-.iqH equ 160
-.dxO equ 176
-.dyO equ 192
-.dzO equ 208
-.dxH1 equ 224
-.dyH1 equ 240
-.dzH1 equ 256
-.dxH2 equ 272
-.dyH2 equ 288
-.dzH2 equ 304
-.qqO equ 320
-.qqH equ 336
-.rinvO equ 352
-.rinvH1 equ 368
-.rinvH2 equ 384
-.rO equ 400
-.rH1 equ 416
-.rH2 equ 432
-.tabscale equ 448
-.two equ 464
-.c6 equ 480
-.c12 equ 496
-.vctot equ 512
-.vnbtot equ 528
-.fixO equ 544
-.fiyO equ 560
-.fizO equ 576
-.fixH1 equ 592
-.fiyH1 equ 608
-.fizH1 equ 624
-.fixH2 equ 640
-.fiyH2 equ 656
-.fizH2 equ 672
-.fjx equ 688
-.fjy equ 704
-.fjz equ 720
-.half equ 736
-.three equ 752
-.is3 equ 768
-.ii3 equ 772
-.ntia equ 776
-.innerjjnr equ 780
-.innerk equ 784
-.salign equ 788
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 792 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp +%$tabscale]
-
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, [edx + ebx*4 + 4]
- movss xmm5, [ebp + %$facel]
- mulss xmm3, xmm5
- mulss xmm4, xmm5
-
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- movaps [esp + .iqO], xmm3
- movaps [esp + .iqH], xmm4
-
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- mov [esp + .ntia], ecx
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .odd_inner
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$charge] ; base of charge[]
-
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + ecx*4]
- movss xmm6, [esi + ebx*4]
- movss xmm7, [esi + edx*4]
-
- shufps xmm3, xmm6, 00000000b
- shufps xmm4, xmm7, 00000000b
- shufps xmm3, xmm4, 10001000b ; all charges in xmm3
- movaps xmm4, xmm3 ; and in xmm4
- mulps xmm3, [esp + .iqO]
- mulps xmm4, [esp + .iqH]
-
- movd mm0, eax ; use mmx registers as temp. storage
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- movaps [esp + .qqO], xmm3
- movaps [esp + .qqH], xmm4
-
- mov esi, [ebp + %$type]
- mov eax, [esi + eax*4]
- mov ebx, [esi + ebx*4]
- mov ecx, [esi + ecx*4]
- mov edx, [esi + edx*4]
- mov esi, [ebp + %$nbfp]
- shl eax, 1
- shl ebx, 1
- shl ecx, 1
- shl edx, 1
- mov edi, [esp + .ntia]
- add eax, edi
- add ebx, edi
- add ecx, edi
- add edx, edi
-
- movlps xmm6, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm6, [esi + ebx*4]
- movhps xmm7, [esi + edx*4]
-
- movaps xmm4, xmm6
- shufps xmm4, xmm7, 10001000b
- shufps xmm6, xmm7, 11011101b
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- movaps [esp + .c6], xmm4
- movaps [esp + .c12], xmm6
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move four coordinates to xmm0-xmm2
- movlps xmm4, [esi + eax*4]
- movlps xmm5, [esi + ecx*4]
- movss xmm2, [esi + eax*4 + 8]
- movss xmm6, [esi + ecx*4 + 8]
-
- movhps xmm4, [esi + ebx*4]
- movhps xmm5, [esi + edx*4]
-
- movss xmm0, [esi + ebx*4 + 8]
- movss xmm1, [esi + edx*4 + 8]
-
- shufps xmm2, xmm0, 0b
- shufps xmm6, xmm1, 0b
-
- movaps xmm0, xmm4
- movaps xmm1, xmm4
-
- shufps xmm2, xmm6, 10001000b
-
- shufps xmm0, xmm5, 10001000b
- shufps xmm1, xmm5, 11011101b
-
- ; move ixO-izO to xmm4-xmm6
- movaps xmm4, [esp + .ixO]
- movaps xmm5, [esp + .iyO]
- movaps xmm6, [esp + .izO]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxO], xmm4
- movaps [esp + .dyO], xmm5
- movaps [esp + .dzO], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm4, xmm5
- addps xmm4, xmm6
- movaps xmm7, xmm4
- ; rsqO in xmm7
-
- ; move ixH1-izH1 to xmm4-xmm6
- movaps xmm4, [esp + .ixH1]
- movaps xmm5, [esp + .iyH1]
- movaps xmm6, [esp + .izH1]
-
- ; calc dr
- subps xmm4, xmm0
- subps xmm5, xmm1
- subps xmm6, xmm2
-
- ; store dr
- movaps [esp + .dxH1], xmm4
- movaps [esp + .dyH1], xmm5
- movaps [esp + .dzH1], xmm6
- ; square it
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- mulps xmm6,xmm6
- addps xmm6, xmm5
- addps xmm6, xmm4
- ; rsqH1 in xmm6
-
- ; move ixH2-izH2 to xmm3-xmm5
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
-
- ; calc dr
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- ; store dr
- movaps [esp + .dxH2], xmm3
- movaps [esp + .dyH2], xmm4
- movaps [esp + .dzH2], xmm5
- ; square it
- mulps xmm3,xmm3
- mulps xmm4,xmm4
- mulps xmm5,xmm5
- addps xmm5, xmm4
- addps xmm5, xmm3
- ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
- ; start with rsqO - seed to xmm2
- rsqrtps xmm2, xmm7
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm7 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvO], xmm4 ; rinvO in xmm4
- mulps xmm7, xmm4
- movaps [esp + .rO], xmm7
-
- ; rsqH1 - seed in xmm2
- rsqrtps xmm2, xmm6
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm6 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH1], xmm4 ; rinvH1 in xmm4
- mulps xmm6, xmm4
- movaps [esp + .rH1], xmm6
-
- ; rsqH2 - seed to xmm2
- rsqrtps xmm2, xmm5
- movaps xmm3, xmm2
- mulps xmm2, xmm2
- movaps xmm4, [esp + .three]
- mulps xmm2, xmm5 ; rsq*lu*lu
- subps xmm4, xmm2 ; 3.0-rsq*lu*lu
- mulps xmm4, xmm3 ; lu*(3-rsq*lu*lu)
- mulps xmm4, [esp + .half]
- movaps [esp + .rinvH2], xmm4 ; rinvH2 in xmm4
- mulps xmm5, xmm4
- movaps [esp + .rH2], xmm5
-
- ; do O interactions
- ;; rO is still in xmm7.
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
-
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of mm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm0, xmm7 ; add to fscal
-
- ; Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 32]
- movlps xmm7, [esi + ecx*4 + 32]
- movhps xmm5, [esi + ebx*4 + 32]
- movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 40]
- movlps xmm3, [esi + ecx*4 + 40]
- movhps xmm7, [esi + ebx*4 + 40]
- movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; repulsion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, xmm0 ; add to fscal
- addps xmm5, [esp + .vnbtot] ; total nonbonded potential in xmm5.
- xorps xmm4, xmm4
-
- mulps xmm7, [esp + .rinvO] ; total fscal now in xmm7
-
- mulps xmm7, [esp + .tabscale]
- movaps [esp + .vnbtot], xmm5
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; tx in xmm0-xmm2
-
- ; update O forces
- movaps xmm3, [esp + .fixO]
- movaps xmm4, [esp + .fiyO]
- movaps xmm7, [esp + .fizO]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixO], xmm3
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm7
- ; update j forces with water O
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ;; Done with O interactions - now H1!
- movaps xmm7, [esp + .rH1]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm7 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH1]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH1]
- movaps xmm1, [esp + .dyH1]
- movaps xmm2, [esp + .dzH1]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- ; update H1 forces
- movaps xmm3, [esp + .fixH1]
- movaps xmm4, [esp + .fiyH1]
- movaps xmm7, [esp + .fizH1]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH1], xmm3
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm7
- ; update j forces with water H1
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
- movaps [esp + .fjx], xmm0
- movaps [esp + .fjy], xmm1
- movaps [esp + .fjz], xmm2
-
- ; Done with H1, finally we do H2 interactions
- movaps xmm7, [esp + .rH2]
- mulps xmm7, [esp + .tabscale]
- movhlps xmm4, xmm7
- cvttps2pi mm6, xmm7
- cvttps2pi mm7, xmm4 ; mm6/mm7 contain lu indices
-
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm4, mm7
- movlhps xmm3, xmm4
-
- subps xmm7, xmm3
- movaps xmm1, xmm7 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm7, xmm0 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul
- xorps xmm4, xmm4
- addps xmm5, [esp + .vctot]
- mulps xmm7, [esp + .rinvH2]
- movaps [esp + .vctot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxH2]
- movaps xmm1, [esp + .dyH2]
- movaps xmm2, [esp + .dzH2]
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; update H2 forces
- movaps xmm3, [esp + .fixH2]
- movaps xmm4, [esp + .fiyH2]
- movaps xmm7, [esp + .fizH2]
- addps xmm3, xmm0
- addps xmm4, xmm1
- addps xmm7, xmm2
- movaps [esp + .fixH2], xmm3
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm7
-
- mov edi, [ebp +%$faction]
- ; update j forces
- addps xmm0, [esp + .fjx]
- addps xmm1, [esp + .fjy]
- addps xmm2, [esp + .fjz]
-
- movlps xmm4, [edi + eax*4]
- movlps xmm7, [edi + ecx*4]
- movhps xmm4, [edi + ebx*4]
- movhps xmm7, [edi + edx*4]
-
- movaps xmm3, xmm4
- shufps xmm3, xmm7, 10001000b
- shufps xmm4, xmm7, 11011101b
- ; xmm3 has fjx, xmm4 has fjy.
- subps xmm3, xmm0
- subps xmm4, xmm1
- ; unpack the back for storing.
- movaps xmm7, xmm3
- unpcklps xmm7, xmm4
- unpckhps xmm3, xmm4
- movlps [edi + eax*4], xmm7
- movlps [edi + ecx*4], xmm3
- movhps [edi + ebx*4], xmm7
- movhps [edi + edx*4], xmm3
- ; finally z forces
- movss xmm0, [edi + eax*4 + 8]
- movss xmm1, [edi + ebx*4 + 8]
- movss xmm3, [edi + ecx*4 + 8]
- movss xmm4, [edi + edx*4 + 8]
- subss xmm0, xmm2
- shufps xmm2, xmm2, 11100101b
- subss xmm1, xmm2
- shufps xmm2, xmm2, 11101010b
- subss xmm3, xmm2
- shufps xmm2, xmm2, 11111111b
- subss xmm4, xmm2
- movss [edi + eax*4 + 8], xmm0
- movss [edi + ebx*4 + 8], xmm1
- movss [edi + ecx*4 + 8], xmm3
- movss [edi + edx*4 + 8], xmm4
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .odd_inner
- jmp .unroll_loop
-.odd_inner:
- add [esp + .innerk], dword 4
- jnz .odd_loop
- jmp .updateouterdata
-.odd_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .iqO]
- mov esi, [ebp + %$charge]
- movhps xmm4, [esp + .iqH]
- movss xmm3, [esi + eax*4] ; charge in xmm3
- shufps xmm3, xmm3, 0b
- mulps xmm3, xmm4
- movaps [esp + .qqO], xmm3 ; use oxygen qq for storage.
-
- xorps xmm6, xmm6
- mov esi, [ebp + %$type]
- mov ebx, [esi + eax*4]
- mov esi, [ebp + %$nbfp]
- shl ebx, 1
- add ebx, [esp + .ntia]
- movlps xmm6, [esi + ebx*4]
- movaps xmm7, xmm6
- shufps xmm6, xmm6, 11111100b
- shufps xmm7, xmm7, 11111101b
- movaps [esp + .c6], xmm6
- movaps [esp + .c12], xmm7
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; move j coords to xmm0-xmm2
- movss xmm0, [esi + eax*4]
- movss xmm1, [esi + eax*4 + 4]
- movss xmm2, [esi + eax*4 + 8]
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
-
- movss xmm3, [esp + .ixO]
- movss xmm4, [esp + .iyO]
- movss xmm5, [esp + .izO]
-
- movlps xmm6, [esp + .ixH1]
- movlps xmm7, [esp + .ixH2]
- unpcklps xmm6, xmm7
- movlhps xmm3, xmm6
- movlps xmm6, [esp + .iyH1]
- movlps xmm7, [esp + .iyH2]
- unpcklps xmm6, xmm7
- movlhps xmm4, xmm6
- movlps xmm6, [esp + .izH1]
- movlps xmm7, [esp + .izH2]
- unpcklps xmm6, xmm7
- movlhps xmm5, xmm6
-
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
-
- movaps [esp + .dxO], xmm3
- movaps [esp + .dyO], xmm4
- movaps [esp + .dzO], xmm5
-
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
-
- addps xmm4, xmm3
- addps xmm4, xmm5
- ; rsq in xmm4.
-
- rsqrtps xmm5, xmm4
- ; lookup seed in xmm5
- movaps xmm2, xmm5
- mulps xmm5, xmm5
- movaps xmm1, [esp + .three]
- mulps xmm5, xmm4 ;rsq*lu*lu
- movaps xmm0, [esp + .half]
- subps xmm1, xmm5 ; 3.0-rsq*lu*lu
- mulps xmm1, xmm2
- mulps xmm0, xmm1 ; xmm0=rinv
- mulps xmm4, xmm0 ; xmm4=r
- movaps [esp + .rinvO], xmm0
-
- mulps xmm4, [esp + .tabscale]
- movhlps xmm7, xmm4
- cvttps2pi mm6, xmm4
- cvttps2pi mm7, xmm7 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm7, mm7
- movlhps xmm3, xmm7
-
- subps xmm4, xmm3
- movaps xmm1, xmm4 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ecx
- movd mm2, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm0, [esp + .qqO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm0 ; vcoul=qq*VV
- mulps xmm0, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and xmm0 fijC.
- ; increment vcoul - then we can get rid of mm5.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 16] ; half table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 11111100b
- shufps xmm5, xmm5, 11111101b
-
- movlps xmm7, [esi + eax*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 11111100b
- shufps xmm7, xmm7, 11111101b
- ; dispersion table ready, in xmm4-xmm7
- mulss xmm6, xmm1 ; xmm6=Geps
- mulss xmm7, xmm2 ; xmm7=Heps2
- addss xmm5, xmm6
- addss xmm5, xmm7 ; xmm5=Fp
- mulss xmm7, [esp + .two] ; two*Heps2
- addss xmm7, xmm6
- addss xmm7, xmm5 ; xmm7=FF
- mulss xmm5, xmm1 ; xmm5=eps*Fp
- addss xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm0, xmm7 ; add to fscal
-
- ; Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm4, 10001000b
- shufps xmm5, xmm5, 11011101b
-
- movlps xmm7, [esi + eax*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 10001000b
- shufps xmm7, xmm7, 11011101b
- ; repulsion table ready, in xmm4-xmm7
- mulss xmm6, xmm1 ; xmm6=Geps
- mulss xmm7, xmm2 ; xmm7=Heps2
- addss xmm5, xmm6
- addss xmm5, xmm7 ; xmm5=Fp
- mulss xmm7, [esp + .two] ; two*Heps2
- addss xmm7, xmm6
- addss xmm7, xmm5 ; xmm7=FF
- mulss xmm5, xmm1 ; xmm5=eps*Fp
- addss xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, xmm0 ; add to fscal
- addps xmm5, [esp + .vnbtot] ; total nonbonded potential in xmm5.
-
- xorps xmm4, xmm4
- movd eax, mm0
- movd ecx, mm1
- movd edx, mm2
-
- mulps xmm7, [esp + .rinvO] ; total fscal now in xmm7
- movaps [esp + .vnbtot], xmm5
- mulps xmm7, [esp + .tabscale]
- subps xmm4, xmm7
-
- movaps xmm0, [esp + .dxO]
- movaps xmm1, [esp + .dyO]
- movaps xmm2, [esp + .dzO]
-
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- mulps xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
- movss xmm3, [esp + .fixO]
- movss xmm4, [esp + .fiyO]
- movss xmm5, [esp + .fizO]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [esp + .fixO], xmm3
- movss [esp + .fiyO], xmm4
- movss [esp + .fizO], xmm5 ; updated the O force. now do the H's
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- shufps xmm3, xmm3, 11100110b ; shift right
- shufps xmm4, xmm4, 11100110b
- shufps xmm5, xmm5, 11100110b
- addss xmm3, [esp + .fixH1]
- addss xmm4, [esp + .fiyH1]
- addss xmm5, [esp + .fizH1]
- movss [esp + .fixH1], xmm3
- movss [esp + .fiyH1], xmm4
- movss [esp + .fizH1], xmm5 ; updated the H1 force.
-
- mov edi, [ebp + %$faction]
- shufps xmm3, xmm3, 11100111b ; shift right
- shufps xmm4, xmm4, 11100111b
- shufps xmm5, xmm5, 11100111b
- addss xmm3, [esp + .fixH2]
- addss xmm4, [esp + .fiyH2]
- addss xmm5, [esp + .fizH2]
- movss [esp + .fixH2], xmm3
- movss [esp + .fiyH2], xmm4
- movss [esp + .fizH2], xmm5 ; updated the H2 force.
-
- ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
- xorps xmm5, xmm5
- movaps xmm3, xmm0
- movlps xmm6, [edi + eax*4]
- movss xmm7, [edi + eax*4 + 8]
- unpcklps xmm3, xmm1
- movlhps xmm3, xmm5
- unpckhps xmm0, xmm1
- addps xmm0, xmm3
- movhlps xmm3, xmm0
- addps xmm0, xmm3 ; x,y sum in xmm0
-
- movhlps xmm1, xmm2
- addss xmm2, xmm1
- shufps xmm1, xmm1, 1b
- addss xmm2, xmm1 ; z sum in xmm2
- subps xmm6, xmm0
- subss xmm7, xmm2
-
- movlps [edi + eax*4], xmm6
- movss [edi + eax*4 + 8], xmm7
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .odd_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- mov edx, [ebp + %$gid]
- mov edx, [edx]
- add [ebp + %$gid], dword 4
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 792
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
-
-proc inl3330_sse
-%$nri arg
-%$iinr arg
-%$jindex arg
-%$jjnr arg
-%$shift arg
-%$shiftvec arg
-%$fshift arg
-%$gid arg
-%$pos arg
-%$faction arg
-%$charge arg
-%$facel arg
-%$Vc arg
-%$type arg
-%$ntype arg
-%$nbfp arg
-%$Vnb arg
-%$tabscale arg
-%$VFtab arg
- ;; stack offsets for local variables
- ;; bottom of stack is cache-aligned for sse use
-.ixO equ 0
-.iyO equ 16
-.izO equ 32
-.ixH1 equ 48
-.iyH1 equ 64
-.izH1 equ 80
-.ixH2 equ 96
-.iyH2 equ 112
-.izH2 equ 128
-.jxO equ 144
-.jyO equ 160
-.jzO equ 176
-.jxH1 equ 192
-.jyH1 equ 208
-.jzH1 equ 224
-.jxH2 equ 240
-.jyH2 equ 256
-.jzH2 equ 272
-.dxOO equ 288
-.dyOO equ 304
-.dzOO equ 320
-.dxOH1 equ 336
-.dyOH1 equ 352
-.dzOH1 equ 368
-.dxOH2 equ 384
-.dyOH2 equ 400
-.dzOH2 equ 416
-.dxH1O equ 432
-.dyH1O equ 448
-.dzH1O equ 464
-.dxH1H1 equ 480
-.dyH1H1 equ 496
-.dzH1H1 equ 512
-.dxH1H2 equ 528
-.dyH1H2 equ 544
-.dzH1H2 equ 560
-.dxH2O equ 576
-.dyH2O equ 592
-.dzH2O equ 608
-.dxH2H1 equ 624
-.dyH2H1 equ 640
-.dzH2H1 equ 656
-.dxH2H2 equ 672
-.dyH2H2 equ 688
-.dzH2H2 equ 704
-.qqOO equ 720
-.qqOH equ 736
-.qqHH equ 752
-.two equ 768
-.tabscale equ 784
-.c6 equ 800
-.c12 equ 816
-.vctot equ 832
-.vnbtot equ 848
-.fixO equ 864
-.fiyO equ 880
-.fizO equ 896
-.fixH1 equ 912
-.fiyH1 equ 928
-.fizH1 equ 944
-.fixH2 equ 960
-.fiyH2 equ 976
-.fizH2 equ 992
-.fjxO equ 1008
-.fjyO equ 1024
-.fjzO equ 1040
-.fjxH1 equ 1056
-.fjyH1 equ 1072
-.fjzH1 equ 1088
-.fjxH2 equ 1104
-.fjyH2 equ 1120
-.fjzH2 equ 1136
-.half equ 1152
-.three equ 1168
-.rsqOO equ 1184
-.rsqOH1 equ 1200
-.rsqOH2 equ 1216
-.rsqH1O equ 1232
-.rsqH1H1 equ 1248
-.rsqH1H2 equ 1264
-.rsqH2O equ 1280
-.rsqH2H1 equ 1296
-.rsqH2H2 equ 1312
-.rinvOO equ 1328
-.rinvOH1 equ 1344
-.rinvOH2 equ 1360
-.rinvH1O equ 1376
-.rinvH1H1 equ 1392
-.rinvH1H2 equ 1408
-.rinvH2O equ 1424
-.rinvH2H1 equ 1440
-.rinvH2H2 equ 1456
-.fstmp equ 1472
-.is3 equ 1488
-.ii3 equ 1492
-.innerjjnr equ 1496
-.innerk equ 1500
-.salign equ 1504
- push eax
- push ebx
- push ecx
- push edx
- push esi
- push edi
- sub esp, 1508 ; local stack space
- mov eax, esp
- and eax, 0xf
- sub esp, eax
- mov [esp + .salign], eax
-
- emms
-
- movups xmm0, [sse_half]
- movups xmm1, [sse_two]
- movups xmm2, [sse_three]
- movss xmm3, [ebp +%$tabscale]
- movaps [esp + .half], xmm0
- movaps [esp + .two], xmm1
- movaps [esp + .three], xmm2
- shufps xmm3, xmm3, 0b
- movaps [esp + .tabscale], xmm3
-
- ;; assume we have at least one i particle - start directly
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- mov ebx, [ecx] ; ebx =ii
-
- mov edx, [ebp + %$charge]
- movss xmm3, [edx + ebx*4]
- movss xmm4, xmm3
- movss xmm5, [edx + ebx*4 + 4]
- movss xmm6, [ebp + %$facel]
- mulss xmm3, xmm3
- mulss xmm4, xmm5
- mulss xmm5, xmm5
- mulss xmm3, xmm6
- mulss xmm4, xmm6
- mulss xmm5, xmm6
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .qqOO], xmm3
- movaps [esp + .qqOH], xmm4
- movaps [esp + .qqHH], xmm5
-
- xorps xmm0, xmm0
- mov edx, [ebp + %$type]
- mov ecx, [edx + ebx*4]
- shl ecx, 1
- mov edx, ecx
- imul ecx, [ebp + %$ntype] ; ecx = ntia = 2*ntype*type[ii0]
- add edx, ecx
- mov eax, [ebp + %$nbfp]
- movlps xmm0, [eax + edx*4]
- movaps xmm1, xmm0
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 01010101b
- movaps [esp + .c6], xmm0
- movaps [esp + .c12], xmm1
-
-.outer:
- mov eax, [ebp + %$shift] ; eax = pointer into shift[]
- mov ebx, [eax] ; ebx=shift[n]
- add [ebp + %$shift], dword 4 ; advance pointer one step
-
- lea ebx, [ebx + ebx*2] ; ebx=3*is
- mov [esp + .is3],ebx ; store is3
-
- mov eax, [ebp + %$shiftvec] ; eax = base of shiftvec[]
-
- movss xmm0, [eax + ebx*4]
- movss xmm1, [eax + ebx*4 + 4]
- movss xmm2, [eax + ebx*4 + 8]
-
- mov ecx, [ebp + %$iinr] ; ecx = pointer into iinr[]
- add [ebp + %$iinr], dword 4 ; advance pointer
- mov ebx, [ecx] ; ebx =ii
-
- lea ebx, [ebx + ebx*2] ; ebx = 3*ii=ii3
- mov eax, [ebp + %$pos] ; eax = base of pos[]
- mov [esp + .ii3], ebx
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
- addss xmm3, [eax + ebx*4]
- addss xmm4, [eax + ebx*4 + 4]
- addss xmm5, [eax + ebx*4 + 8]
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixO], xmm3
- movaps [esp + .iyO], xmm4
- movaps [esp + .izO], xmm5
-
- movss xmm3, xmm0
- movss xmm4, xmm1
- movss xmm5, xmm2
- addss xmm0, [eax + ebx*4 + 12]
- addss xmm1, [eax + ebx*4 + 16]
- addss xmm2, [eax + ebx*4 + 20]
- addss xmm3, [eax + ebx*4 + 24]
- addss xmm4, [eax + ebx*4 + 28]
- addss xmm5, [eax + ebx*4 + 32]
-
- shufps xmm0, xmm0, 0b
- shufps xmm1, xmm1, 0b
- shufps xmm2, xmm2, 0b
- shufps xmm3, xmm3, 0b
- shufps xmm4, xmm4, 0b
- shufps xmm5, xmm5, 0b
- movaps [esp + .ixH1], xmm0
- movaps [esp + .iyH1], xmm1
- movaps [esp + .izH1], xmm2
- movaps [esp + .ixH2], xmm3
- movaps [esp + .iyH2], xmm4
- movaps [esp + .izH2], xmm5
-
- ; clear vctot and i forces
- xorps xmm4, xmm4
- movaps [esp + .vctot], xmm4
- movaps [esp + .vnbtot], xmm4
- movaps [esp + .fixO], xmm4
- movaps [esp + .fiyO], xmm4
- movaps [esp + .fizO], xmm4
- movaps [esp + .fixH1], xmm4
- movaps [esp + .fiyH1], xmm4
- movaps [esp + .fizH1], xmm4
- movaps [esp + .fixH2], xmm4
- movaps [esp + .fiyH2], xmm4
- movaps [esp + .fizH2], xmm4
-
- mov eax, [ebp + %$jindex]
- mov ecx, [eax] ; jindex[n]
- mov edx, [eax + 4] ; jindex[n+1]
- add [ebp + %$jindex], dword 4
- sub edx, ecx ; number of innerloop atoms
-
- mov esi, [ebp + %$pos]
- mov edi, [ebp + %$faction]
- mov eax, [ebp + %$jjnr]
- shl ecx, 2
- add eax, ecx
- mov [esp + .innerjjnr], eax ; pointer to jjnr[nj0]
- sub edx, dword 4
- mov [esp + .innerk], edx ; number of innerloop atoms
- jge .unroll_loop
- jmp .single_check
-.unroll_loop:
- ;; quad-unroll innerloop here.
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
-
- mov eax, [edx]
- mov ebx, [edx + 4]
- mov ecx, [edx + 8]
- mov edx, [edx + 12] ; eax-edx=jnr1-4
-
- add [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4)
-
- mov esi, [ebp + %$pos] ; base of pos[]
-
- lea eax, [eax + eax*2] ; replace jnr with j3
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2] ; replace jnr with j3
- lea edx, [edx + edx*2]
-
- ; move j coordinates to local temp. variables
- movlps xmm2, [esi + eax*4]
- movlps xmm3, [esi + eax*4 + 12]
- movlps xmm4, [esi + eax*4 + 24]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm6, [esi + ebx*4 + 12]
- movlps xmm7, [esi + ebx*4 + 24]
-
- movhps xmm2, [esi + ecx*4]
- movhps xmm3, [esi + ecx*4 + 12]
- movhps xmm4, [esi + ecx*4 + 24]
-
- movhps xmm5, [esi + edx*4]
- movhps xmm6, [esi + edx*4 + 12]
- movhps xmm7, [esi + edx*4 + 24]
-
- ;; current state:
- ;; xmm2= jxOa jyOa jxOc jyOc
- ;; xmm3= jxH1a jyH1a jxH1c jyH1c
- ;; xmm4= jxH2a jyH2a jxH2c jyH2c
- ;; xmm5= jxOb jyOb jxOd jyOd
- ;; xmm6= jxH1b jyH1b jxH1d jyH1d
- ;; xmm7= jxH2b jyH2b jxH2d jyH2d
-
- movaps xmm0, xmm2
- movaps xmm1, xmm3
- unpcklps xmm0, xmm5 ; xmm0= jxOa jxOb jyOa jyOb
- unpcklps xmm1, xmm6 ; xmm1= jxH1a jxH1b jyH1a jyH1b
- unpckhps xmm2, xmm5 ; xmm2= jxOc jxOd jyOc jyOd
- unpckhps xmm3, xmm6 ; xmm3= jxH1c jxH1d jyH1c jyH1d
- movaps xmm5, xmm4
- movaps xmm6, xmm0
- unpcklps xmm4, xmm7 ; xmm4= jxH2a jxH2b jyH2a jyH2b
- unpckhps xmm5, xmm7 ; xmm5= jxH2c jxH2d jyH2c jyH2d
- movaps xmm7, xmm1
- movlhps xmm0, xmm2 ; xmm0= jxOa jxOb jxOc jxOd
- movaps [esp + .jxO], xmm0
- movhlps xmm2, xmm6 ; xmm2= jyOa jyOb jyOc jyOd
- movaps [esp + .jyO], xmm2
- movlhps xmm1, xmm3
- movaps [esp + .jxH1], xmm1
- movhlps xmm3, xmm7
- movaps xmm6, xmm4
- movaps [esp + .jyH1], xmm3
- movlhps xmm4, xmm5
- movaps [esp + .jxH2], xmm4
- movhlps xmm5, xmm6
- movaps [esp + .jyH2], xmm5
-
- movss xmm0, [esi + eax*4 + 8]
- movss xmm1, [esi + eax*4 + 20]
- movss xmm2, [esi + eax*4 + 32]
-
- movss xmm3, [esi + ecx*4 + 8]
- movss xmm4, [esi + ecx*4 + 20]
- movss xmm5, [esi + ecx*4 + 32]
-
- movhps xmm0, [esi + ebx*4 + 4]
- movhps xmm1, [esi + ebx*4 + 16]
- movhps xmm2, [esi + ebx*4 + 28]
-
- movhps xmm3, [esi + edx*4 + 4]
- movhps xmm4, [esi + edx*4 + 16]
- movhps xmm5, [esi + edx*4 + 28]
-
- shufps xmm0, xmm3, 11001100b
- shufps xmm1, xmm4, 11001100b
- shufps xmm2, xmm5, 11001100b
- movaps [esp + .jzO], xmm0
- movaps [esp + .jzH1], xmm1
- movaps [esp + .jzH2], xmm2
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixO]
- movaps xmm4, [esp + .iyO]
- movaps xmm5, [esp + .izO]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxOH1], xmm3
- movaps [esp + .dyOH1], xmm4
- movaps [esp + .dzOH1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOO], xmm0
- movaps [esp + .rsqOH1], xmm3
-
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxOH2], xmm0
- movaps [esp + .dyOH2], xmm1
- movaps [esp + .dzOH2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1O], xmm3
- movaps [esp + .dyH1O], xmm4
- movaps [esp + .dzH1O], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqOH2], xmm0
- movaps [esp + .rsqH1O], xmm3
-
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH1]
- movaps xmm4, [esp + .iyH1]
- movaps xmm5, [esp + .izH1]
- subps xmm0, [esp + .jxH1]
- subps xmm1, [esp + .jyH1]
- subps xmm2, [esp + .jzH1]
- subps xmm3, [esp + .jxH2]
- subps xmm4, [esp + .jyH2]
- subps xmm5, [esp + .jzH2]
- movaps [esp + .dxH1H1], xmm0
- movaps [esp + .dyH1H1], xmm1
- movaps [esp + .dzH1H1], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH1H2], xmm3
- movaps [esp + .dyH1H2], xmm4
- movaps [esp + .dzH1H2], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm3, xmm4
- addps xmm3, xmm5
- movaps [esp + .rsqH1H1], xmm0
- movaps [esp + .rsqH1H2], xmm3
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxH1]
- subps xmm4, [esp + .jyH1]
- subps xmm5, [esp + .jzH1]
- movaps [esp + .dxH2O], xmm0
- movaps [esp + .dyH2O], xmm1
- movaps [esp + .dzH2O], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- movaps [esp + .dxH2H1], xmm3
- movaps [esp + .dyH2H1], xmm4
- movaps [esp + .dzH2H1], xmm5
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm0, xmm2
- addps xmm4, xmm3
- addps xmm4, xmm5
- movaps [esp + .rsqH2O], xmm0
- movaps [esp + .rsqH2H1], xmm4
-
- movaps xmm0, [esp + .ixH2]
- movaps xmm1, [esp + .iyH2]
- movaps xmm2, [esp + .izH2]
- subps xmm0, [esp + .jxH2]
- subps xmm1, [esp + .jyH2]
- subps xmm2, [esp + .jzH2]
- movaps [esp + .dxH2H2], xmm0
- movaps [esp + .dyH2H2], xmm1
- movaps [esp + .dzH2H2], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2
- movaps [esp + .rsqH2H2], xmm0
-
- ; start doing invsqrt. use rsq values in xmm0, xmm4
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinvH2H2
- mulps xmm7, [esp + .half] ; rinvH2H1
- movaps [esp + .rinvH2H2], xmm3
- movaps [esp + .rinvH2H1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOO]
- rsqrtps xmm5, [esp + .rsqOH1]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOO]
- mulps xmm5, [esp + .rsqOH1]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOO], xmm3
- movaps [esp + .rinvOH1], xmm7
-
- rsqrtps xmm1, [esp + .rsqOH2]
- rsqrtps xmm5, [esp + .rsqH1O]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqOH2]
- mulps xmm5, [esp + .rsqH1O]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvOH2], xmm3
- movaps [esp + .rinvH1O], xmm7
-
- rsqrtps xmm1, [esp + .rsqH1H1]
- rsqrtps xmm5, [esp + .rsqH1H2]
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, [esp + .rsqH1H1]
- mulps xmm5, [esp + .rsqH1H2]
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half]
- mulps xmm7, [esp + .half]
- movaps [esp + .rinvH1H1], xmm3
- movaps [esp + .rinvH1H2], xmm7
-
- rsqrtps xmm1, [esp + .rsqH2O]
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, [esp + .rsqH2O]
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half]
- movaps [esp + .rinvH2O], xmm3
-
- ;; start with OO interaction.
- movaps xmm0, [esp + .rinvOO]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOO] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd mm0, eax
- movd mm1, ebx
- movd mm2, ecx
- movd mm3, edx
-
- mov esi, [ebp + %$VFtab]
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOO]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
- ; increment vcoul - then we can get rid of mm5.
- ;; update vctot
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- ; put scalar force on stack temporarily...
- movaps [esp + .fstmp], xmm3
-
- ; dispersion
- movlps xmm5, [esi + eax*4 + 16]
- movlps xmm7, [esi + ecx*4 + 16]
- movhps xmm5, [esi + ebx*4 + 16]
- movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 24]
- movlps xmm3, [esi + ecx*4 + 24]
- movhps xmm7, [esi + ebx*4 + 24]
- movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; dispersion table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fstmp] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fstmp], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movlps xmm5, [esi + eax*4 + 32]
- movlps xmm7, [esi + ecx*4 + 32]
- movhps xmm5, [esi + ebx*4 + 32]
- movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 40]
- movlps xmm3, [esi + ecx*4 + 40]
- movhps xmm7, [esi + ebx*4 + 40]
- movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
-
- movaps xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fstmp]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm1, xmm1
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm1, xmm7
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H1 interaction
- movaps xmm0, [esp + .rinvOH1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH1]
- mulps xmm1, [esp + .dyOH1]
- mulps xmm2, [esp + .dzOH1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; O-H2 interaction
- movaps xmm0, [esp + .rinvOH2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqOH2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- xorps xmm3, xmm3
- movaps xmm4, xmm3
- movaps xmm5, xmm3
- mulps xmm0, [esp + .dxOH2]
- mulps xmm1, [esp + .dyOH2]
- mulps xmm2, [esp + .dzOH2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
- ; H1-O interaction
- movaps xmm0, [esp + .rinvH1O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H1 interaction
- movaps xmm0, [esp + .rinvH1H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH1H1]
- mulps xmm1, [esp + .dyH1H1]
- mulps xmm2, [esp + .dzH1H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H1-H2 interaction
- movaps xmm0, [esp + .rinvH1H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH1H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH1H2]
- mulps xmm1, [esp + .dyH1H2]
- mulps xmm2, [esp + .dzH1H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
-
- ; H2-O interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2O] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqOH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H1 interaction
- movaps xmm0, [esp + .rinvH2H1]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H1] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH1]
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- mulps xmm0, [esp + .dxH2H1]
- mulps xmm1, [esp + .dyH2H1]
- mulps xmm2, [esp + .dzH2H1]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH1], xmm3
- movaps [esp + .fjyH1], xmm4
- movaps [esp + .fjzH1], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ; H2-H2 interaction
- movaps xmm0, [esp + .rinvH2H2]
- movaps xmm1, xmm0
- mulps xmm1, [esp + .rsqH2H2] ; xmm1=r
- mulps xmm1, [esp + .tabscale]
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ;xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
-
- movd eax, mm6
- psrlq mm6, 32
- movd ecx, mm7
- psrlq mm7, 32
- movd ebx, mm6
- movd edx, mm7
-
- lea eax, [eax + eax*2]
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + eax*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm5, [esi + ebx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
-
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + eax*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm7, [esi + ebx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
-
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
- movaps xmm3, [esp + .qqHH]
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point mm5 contains vcoul and mm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- xorps xmm1, xmm1
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- movaps xmm3, [esp + .fjxH2]
- movaps xmm4, [esp + .fjyH2]
- movaps xmm5, [esp + .fjzH2]
- mulps xmm0, [esp + .dxH2H2]
- mulps xmm1, [esp + .dyH2H2]
- mulps xmm2, [esp + .dzH2H2]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fjxH2], xmm3
- movaps [esp + .fjyH2], xmm4
- movaps [esp + .fjzH2], xmm5
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- mov edi, [ebp +%$faction]
-
- movd eax, mm0
- movd ebx, mm1
- movd ecx, mm2
- movd edx, mm3
-
- ; Did all interactions - now update j forces.
- ; 4 j waters with three atoms each - first do a & b j particles
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpcklps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjxOb fjyOb
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOb fjyOb
- unpcklps xmm1, xmm2 ; xmm1= fjzOa fjxH1a fjzOb fjxH1b
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpcklps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1b fjzH1b
- unpcklps xmm5, xmm6 ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
- movlhps xmm0, xmm1 ; xmm0= fjxOa fjyOa fjzOa fjxH1a
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOb fjyOb fjzOb fjxH1b
- movlhps xmm4, xmm5 ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
- movups xmm1, [edi + eax*4]
- movups xmm2, [edi + eax*4 + 16]
- movups xmm5, [edi + ebx*4]
- movups xmm6, [edi + ebx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + eax*4 + 32]
- movss xmm3, [edi + ebx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm7, xmm7, 1b
-
- movups [edi + eax*4], xmm1
- movups [edi + eax*4 + 16],xmm2
- movups [edi + ebx*4], xmm5
- movups [edi + ebx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + eax*4 + 32], xmm0
- movss [edi + ebx*4 + 32], xmm3
-
- ;; then do the second pair (c & d)
- movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa fjxOb fjxOc fjxOd
- movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa fjyOb fjyOc fjyOd
- unpckhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjxOd fjyOd
- movaps xmm1, [esp + .fjzO]
- movaps xmm2, [esp + .fjxH1]
- movhlps xmm3, xmm0 ; xmm3= fjxOd fjyOd
- unpckhps xmm1, xmm2 ; xmm1= fjzOc fjxH1c fjzOd fjxH1d
- movaps xmm4, [esp + .fjyH1]
- movaps xmm5, [esp + .fjzH1]
- unpckhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d
- movaps xmm5, [esp + .fjxH2]
- movaps xmm6, [esp + .fjyH2]
- movhlps xmm7, xmm4 ; xmm7= fjyH1d fjzH1d
- unpckhps xmm5, xmm6 ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
- movlhps xmm0, xmm1 ; xmm0= fjxOc fjyOc fjzOc fjxH1c
- shufps xmm3, xmm1, 11100100b
- ; xmm3= fjxOd fjyOd fjzOd fjxH1d
- movlhps xmm4, xmm5 ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c
- shufps xmm7, xmm5, 11100100b
- ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
- movups xmm1, [edi + ecx*4]
- movups xmm2, [edi + ecx*4 + 16]
- movups xmm5, [edi + edx*4]
- movups xmm6, [edi + edx*4 + 16]
- addps xmm1, xmm0
- addps xmm2, xmm4
- addps xmm5, xmm3
- addps xmm6, xmm7
- movss xmm0, [edi + ecx*4 + 32]
- movss xmm3, [edi + edx*4 + 32]
-
- movaps xmm4, [esp + .fjzH2]
- movaps xmm7, xmm4
- shufps xmm4, xmm4, 10b
- shufps xmm7, xmm7, 11b
- movups [edi + ecx*4], xmm1
- movups [edi + ecx*4 + 16],xmm2
- movups [edi + edx*4], xmm5
- movups [edi + edx*4 + 16],xmm6
- addss xmm0, xmm4
- addss xmm3, xmm7
- movss [edi + ecx*4 + 32], xmm0
- movss [edi + edx*4 + 32], xmm3
-
- ;; should we do one more iteration?
- sub [esp + .innerk], dword 4
- jl .single_check
- jmp .unroll_loop
-.single_check:
- add [esp + .innerk], dword 4
- jnz .single_loop
- jmp .updateouterdata
-.single_loop:
- mov edx, [esp + .innerjjnr] ; pointer to jjnr[k]
- mov eax, [edx]
- add [esp + .innerjjnr], dword 4
-
- mov esi, [ebp + %$pos]
- lea eax, [eax + eax*2]
-
- ; fetch j coordinates
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- movss xmm3, [esi + eax*4]
- movss xmm4, [esi + eax*4 + 4]
- movss xmm5, [esi + eax*4 + 8]
-
- movlps xmm6, [esi + eax*4 + 12]
- movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
- ;; fetch both z coords in one go, to positions 0 and 3 in xmm7
- movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
- shufps xmm6, xmm6, 11011000b ; xmm6=jxH1 jxH2 jyH1 jyH2
- movlhps xmm3, xmm6 ; xmm3= jxO 0 jxH1 jxH2
- movaps xmm0, [esp + .ixO]
- movaps xmm1, [esp + .iyO]
- movaps xmm2, [esp + .izO]
- shufps xmm4, xmm6, 11100100b ; xmm4= jyO 0 jyH1 jyH2
- shufps xmm5, xmm7, 11000100b ; xmm5= jzO 0 jzH1 jzH2
- ;; store all j coordinates in jO
- movaps [esp + .jxO], xmm3
- movaps [esp + .jyO], xmm4
- movaps [esp + .jzO], xmm5
- subps xmm0, xmm3
- subps xmm1, xmm4
- subps xmm2, xmm5
- movaps [esp + .dxOO], xmm0
- movaps [esp + .dyOO], xmm1
- movaps [esp + .dzOO], xmm2
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- addps xmm0, xmm1
- addps xmm0, xmm2 ; have rsq in xmm0.
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- movaps xmm2, xmm1
- mulps xmm1, xmm1
- movaps xmm3, [esp + .three]
- mulps xmm1, xmm0
- subps xmm3, xmm1
- mulps xmm3, xmm2
- mulps xmm3, [esp + .half] ; rinv iO - j water
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- mov esi, [ebp + %$VFtab]
-
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOO]
- movhps xmm3, [esp + .qqOH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
-
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
- ; put scalar force on stack temporarily...
- movaps [esp + .fstmp], xmm3
-
- ; dispersion
- movss xmm4, [esi + ebx*4 + 16]
- movss xmm5, [esi + ebx*4 + 20]
- movss xmm6, [esi + ebx*4 + 24]
- movss xmm7, [esi + ebx*4 + 28]
- ; dispersion table ready, in xmm4-xmm7
- mulss xmm6, xmm1 ; xmm6=Geps
- mulss xmm7, xmm2 ; xmm7=Heps2
- addss xmm5, xmm6
- addss xmm5, xmm7 ; xmm5=Fp
- mulss xmm7, [esp + .two] ; two*Heps2
- addss xmm7, xmm6
- addss xmm7, xmm5 ; xmm7=FF
- mulss xmm5, xmm1 ; xmm5=eps*Fp
- addss xmm5, xmm4 ; xmm5=VV
- xorps xmm4, xmm4
- movss xmm4, [esp + .c6]
- mulps xmm7, xmm4 ; fijD
- mulps xmm5, xmm4 ; vnb6
- addps xmm7, [esp + .fstmp] ; add to fscal
-
- ; put scalar force on stack. Update vnbtot directly.
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .fstmp], xmm7
- movaps [esp + .vnbtot], xmm5
-
- ; repulsion
- movss xmm4, [esi + ebx*4 + 32]
- movss xmm5, [esi + ebx*4 + 36]
- movss xmm6, [esi + ebx*4 + 40]
- movss xmm7, [esi + ebx*4 + 44]
- ; table ready, in xmm4-xmm7
- mulss xmm6, xmm1 ; xmm6=Geps
- mulss xmm7, xmm2 ; xmm7=Heps2
- addss xmm5, xmm6
- addss xmm5, xmm7 ; xmm5=Fp
- mulss xmm7, [esp + .two] ; two*Heps2
- addss xmm7, xmm6
- addss xmm7, xmm5 ; xmm7=FF
- mulss xmm5, xmm1 ; xmm5=eps*Fp
- addss xmm5, xmm4 ; xmm5=VV
-
- xorps xmm4, xmm4
- movss xmm4, [esp + .c12]
- mulps xmm7, xmm4 ; fijR
- mulps xmm5, xmm4 ; vnb12
- addps xmm7, [esp + .fstmp]
-
- addps xmm5, [esp + .vnbtot]
- movaps [esp + .vnbtot], xmm5
- xorps xmm1, xmm1
-
- mulps xmm7, [esp + .tabscale]
- mulps xmm7, xmm0
- subps xmm1, xmm7
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- mulps xmm0, [esp + .dxOO]
- mulps xmm1, [esp + .dyOO]
- mulps xmm2, [esp + .dzOO]
- ;; initial update for j forces
- xorps xmm3, xmm3
- xorps xmm4, xmm4
- xorps xmm5, xmm5
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixO]
- addps xmm1, [esp + .fiyO]
- addps xmm2, [esp + .fizO]
- movaps [esp + .fixO], xmm0
- movaps [esp + .fiyO], xmm1
- movaps [esp + .fizO], xmm2
-
-
- ;; done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
- movaps xmm0, [esp + .ixH1]
- movaps xmm1, [esp + .iyH1]
- movaps xmm2, [esp + .izH1]
- movaps xmm3, [esp + .ixH2]
- movaps xmm4, [esp + .iyH2]
- movaps xmm5, [esp + .izH2]
- subps xmm0, [esp + .jxO]
- subps xmm1, [esp + .jyO]
- subps xmm2, [esp + .jzO]
- subps xmm3, [esp + .jxO]
- subps xmm4, [esp + .jyO]
- subps xmm5, [esp + .jzO]
- movaps [esp + .dxH1O], xmm0
- movaps [esp + .dyH1O], xmm1
- movaps [esp + .dzH1O], xmm2
- movaps [esp + .dxH2O], xmm3
- movaps [esp + .dyH2O], xmm4
- movaps [esp + .dzH2O], xmm5
- mulps xmm0, xmm0
- mulps xmm1, xmm1
- mulps xmm2, xmm2
- mulps xmm3, xmm3
- mulps xmm4, xmm4
- mulps xmm5, xmm5
- addps xmm0, xmm1
- addps xmm4, xmm3
- addps xmm0, xmm2 ; have rsqH1 in xmm0.
- addps xmm4, xmm5 ; have rsqH2 in xmm4.
-
- ;; start with H1, save H2 data
- movaps [esp + .rsqH2O], xmm4
-
- ;; do invsqrt
- rsqrtps xmm1, xmm0
- rsqrtps xmm5, xmm4
- movaps xmm2, xmm1
- movaps xmm6, xmm5
- mulps xmm1, xmm1
- mulps xmm5, xmm5
- movaps xmm3, [esp + .three]
- movaps xmm7, xmm3
- mulps xmm1, xmm0
- mulps xmm5, xmm4
- subps xmm3, xmm1
- subps xmm7, xmm5
- mulps xmm3, xmm2
- mulps xmm7, xmm6
- mulps xmm3, [esp + .half] ; rinv H1 - j water
- mulps xmm7, [esp + .half] ; rinv H2 - j water
-
- ;; start with H1, save H2 data
- movaps [esp + .rinvH2O], xmm7
-
- movaps xmm1, xmm3
- mulps xmm1, xmm0 ; xmm1=r
- movaps xmm0, xmm3 ; xmm0=rinv
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
- mulps xmm0, [esp + .dxH1O]
- mulps xmm1, [esp + .dyH1O]
- mulps xmm2, [esp + .dzH1O]
- ;; update forces H1 - j water
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH1]
- addps xmm1, [esp + .fiyH1]
- addps xmm2, [esp + .fizH1]
- movaps [esp + .fixH1], xmm0
- movaps [esp + .fiyH1], xmm1
- movaps [esp + .fizH1], xmm2
- ;; do table for H2 - j water interaction
- movaps xmm0, [esp + .rinvH2O]
- movaps xmm1, [esp + .rsqH2O]
- mulps xmm1, xmm0 ; xmm0=rinv, xmm1=r
- mulps xmm1, [esp + .tabscale]
-
- movhlps xmm2, xmm1
- cvttps2pi mm6, xmm1
- cvttps2pi mm7, xmm2 ; mm6/mm7 contain lu indices
- cvtpi2ps xmm3, mm6
- cvtpi2ps xmm2, mm7
- movlhps xmm3, xmm2
- subps xmm1, xmm3 ; xmm1=eps
- movaps xmm2, xmm1
- mulps xmm2, xmm2 ; xmm2=eps2
- pslld mm6, 2
- pslld mm7, 2
- movd ebx, mm6
- movd ecx, mm7
- psrlq mm7, 32
- movd edx, mm7 ; table indices in ebx,ecx,edx
-
- lea ebx, [ebx + ebx*2]
- lea ecx, [ecx + ecx*2]
- lea edx, [edx + edx*2]
-
- movlps xmm5, [esi + ebx*4]
- movlps xmm7, [esi + ecx*4]
- movhps xmm7, [esi + edx*4] ; got half coulomb table
- movaps xmm4, xmm5
- shufps xmm4, xmm7, 10001000b
- shufps xmm5, xmm7, 11011101b
-
- movlps xmm7, [esi + ebx*4 + 8]
- movlps xmm3, [esi + ecx*4 + 8]
- movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
- movaps xmm6, xmm7
- shufps xmm6, xmm3, 10001000b
- shufps xmm7, xmm3, 11011101b
- ; coulomb table ready, in xmm4-xmm7
- mulps xmm6, xmm1 ; xmm6=Geps
- mulps xmm7, xmm2 ; xmm7=Heps2
- addps xmm5, xmm6
- addps xmm5, xmm7 ; xmm5=Fp
- mulps xmm7, [esp + .two] ; two*Heps2
-
- xorps xmm3, xmm3
- ;; fetch charges to xmm3 (temporary)
- movss xmm3, [esp + .qqOH]
- movhps xmm3, [esp + .qqHH]
-
- addps xmm7, xmm6
- addps xmm7, xmm5 ; xmm7=FF
- mulps xmm5, xmm1 ; xmm5=eps*Fp
- addps xmm5, xmm4 ; xmm5=VV
- mulps xmm5, xmm3 ; vcoul=qq*VV
- mulps xmm3, xmm7 ; fijC=FF*qq
- ; at this point xmm5 contains vcoul and xmm3 fijC.
- addps xmm5, [esp + .vctot]
- movaps [esp + .vctot], xmm5
-
- xorps xmm1, xmm1
-
- mulps xmm3, [esp + .tabscale]
- mulps xmm3, xmm0
- subps xmm1, xmm3
-
- movaps xmm0, xmm1
- movaps xmm2, xmm1
-
- mulps xmm0, [esp + .dxH2O]
- mulps xmm1, [esp + .dyH2O]
- mulps xmm2, [esp + .dzH2O]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- subps xmm3, xmm0
- subps xmm4, xmm1
- subps xmm5, xmm2
- mov esi, [ebp + %$faction]
- movaps [esp + .fjxO], xmm3
- movaps [esp + .fjyO], xmm4
- movaps [esp + .fjzO], xmm5
- addps xmm0, [esp + .fixH2]
- addps xmm1, [esp + .fiyH2]
- addps xmm2, [esp + .fizH2]
- movaps [esp + .fixH2], xmm0
- movaps [esp + .fiyH2], xmm1
- movaps [esp + .fizH2], xmm2
-
- ;; update j water forces from local variables
- movlps xmm0, [esi + eax*4]
- movlps xmm1, [esi + eax*4 + 12]
- movhps xmm1, [esi + eax*4 + 24]
- movaps xmm3, [esp + .fjxO]
- movaps xmm4, [esp + .fjyO]
- movaps xmm5, [esp + .fjzO]
- movaps xmm6, xmm5
- movaps xmm7, xmm5
- shufps xmm6, xmm6, 10b
- shufps xmm7, xmm7, 11b
- addss xmm5, [esi + eax*4 + 8]
- addss xmm6, [esi + eax*4 + 20]
- addss xmm7, [esi + eax*4 + 32]
- movss [esi + eax*4 + 8], xmm5
- movss [esi + eax*4 + 20], xmm6
- movss [esi + eax*4 + 32], xmm7
- movaps xmm5, xmm3
- unpcklps xmm3, xmm4
- unpckhps xmm5, xmm4
- addps xmm0, xmm3
- addps xmm1, xmm5
- movlps [esi + eax*4], xmm0
- movlps [esi + eax*4 + 12], xmm1
- movhps [esi + eax*4 + 24], xmm1
-
- dec dword [esp + .innerk]
- jz .updateouterdata
- jmp .single_loop
-.updateouterdata:
- mov ecx, [esp + .ii3]
- mov edi, [ebp + %$faction]
- mov esi, [ebp + %$fshift]
- mov edx, [esp + .is3]
-
- ; accumulate Oi forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixO]
- movaps xmm1, [esp + .fiyO]
- movaps xmm2, [esp + .fizO]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4]
- movss xmm4, [edi + ecx*4 + 4]
- movss xmm5, [edi + ecx*4 + 8]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4], xmm3
- movss [edi + ecx*4 + 4], xmm4
- movss [edi + ecx*4 + 8], xmm5
-
- ; accumulate force in xmm6/xmm7 for fshift
- movaps xmm6, xmm0
- movss xmm7, xmm2
- movlhps xmm6, xmm1
- shufps xmm6, xmm6, 1000b
-
- ; accumulate H1i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH1]
- movaps xmm1, [esp + .fiyH1]
- movaps xmm2, [esp + .fizH1]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 12]
- movss xmm4, [edi + ecx*4 + 16]
- movss xmm5, [edi + ecx*4 + 20]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 12], xmm3
- movss [edi + ecx*4 + 16], xmm4
- movss [edi + ecx*4 + 20], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; accumulate H2i forces in xmm0, xmm1, xmm2
- movaps xmm0, [esp + .fixH2]
- movaps xmm1, [esp + .fiyH2]
- movaps xmm2, [esp + .fizH2]
-
- movhlps xmm3, xmm0
- movhlps xmm4, xmm1
- movhlps xmm5, xmm2
- addps xmm0, xmm3
- addps xmm1, xmm4
- addps xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
- movaps xmm3, xmm0
- movaps xmm4, xmm1
- movaps xmm5, xmm2
-
- shufps xmm3, xmm3, 1b
- shufps xmm4, xmm4, 1b
- shufps xmm5, xmm5, 1b
- addss xmm0, xmm3
- addss xmm1, xmm4
- addss xmm2, xmm5 ; xmm0-xmm2 has single force in pos0.
-
- ; increment i force
- movss xmm3, [edi + ecx*4 + 24]
- movss xmm4, [edi + ecx*4 + 28]
- movss xmm5, [edi + ecx*4 + 32]
- addss xmm3, xmm0
- addss xmm4, xmm1
- addss xmm5, xmm2
- movss [edi + ecx*4 + 24], xmm3
- movss [edi + ecx*4 + 28], xmm4
- movss [edi + ecx*4 + 32], xmm5
-
- ;accumulate force in xmm6/xmm7 for fshift
- addss xmm7, xmm2
- movlhps xmm0, xmm1
- shufps xmm0, xmm0, 1000b
- addps xmm6, xmm0
-
- ; increment fshift force
- movlps xmm3, [esi + edx*4]
- movss xmm4, [esi + edx*4 + 8]
- addps xmm3, xmm6
- addss xmm4, xmm7
- movlps [esi + edx*4], xmm3
- movss [esi + edx*4 + 8], xmm4
-
- ; get group index for i particle
- mov edx, [ebp + %$gid] ; get group index for this i particle
- mov edx, [edx]
- add [ebp + %$gid], dword 4 ; advance pointer
-
- ; accumulate total potential energy and update it.
- movaps xmm7, [esp + .vctot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vc]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ; accumulate total lj energy and update it.
- movaps xmm7, [esp + .vnbtot]
- ; accumulate
- movhlps xmm6, xmm7
- addps xmm7, xmm6 ; pos 0-1 in xmm7 have the sum now
- movaps xmm6, xmm7
- shufps xmm6, xmm6, 1b
- addss xmm7, xmm6
-
- ; add earlier value from mem.
- mov eax, [ebp + %$Vnb]
- addss xmm7, [eax + edx*4]
- ; move back to mem.
- movss [eax + edx*4], xmm7
-
- ;; finish if last
- mov ecx, [ebp + %$nri]
- dec ecx
- jecxz .end
- ;; not last, iterate once more!
- mov [ebp + %$nri], ecx
- jmp .outer
-.end:
- emms
- mov eax, [esp + .salign]
- add esp, eax
- add esp, 1508
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
- pop eax
- endproc
-
-
# but it can probably be done in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
-
+# Keep the link order of the old "-lmd -lgmx" line: libmd first, then libgmx,
+# so that a static link can still resolve libmd's references into libgmx
+# (archives are scanned left to right and libmd records no libtool
+# dependency on libgmx).
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
bin_PROGRAMS = \
gromppXXX_SUFFIX_XXX mdrunXXX_SUFFIX_XXX \
sorting.h topdirs.h toppush.h dum_parm.h \
readir.h topcat.h topexcl.h topshake.h
+
mdrunXXX_SUFFIX_XXX_SOURCES = \
mdrun.c
+
tpbconvXXX_SUFFIX_XXX_SOURCES = \
readir.c toputil.c topdirs.c add_par.c \
topexcl.c tpbconv.c add_par.h toputil.h \
topdirs.h readir.h topexcl.h
+
pdb2gmxXXX_SUFFIX_XXX_SOURCES = \
hizzie.c xlate.c specbond.c ter_db.c \
h_db.c genhydro.c pdb2top.c gen_ad.c \
topdirs.h genhydro.h hizzie.h specbond.h \
topexcl.h
+
protonateXXX_SUFFIX_XXX_SOURCES = \
hackblock.c ter_db.c h_db.c genhydro.c \
pgutil.c resall.c add_par.c topexcl.c \
h_db.h resall.h toputil.h hackblock.h \
pgutil.h ter_db.h topdirs.h topexcl.h
+
nmrunXXX_SUFFIX_XXX_SOURCES = \
nmrun.c
+
luckXXX_SUFFIX_XXX_SOURCES = \
luck.c
+
gmxdumpXXX_SUFFIX_XXX_SOURCES = \
gmxdump.c
+
gmxcheckXXX_SUFFIX_XXX_SOURCES = \
gmxcheck.c tpbcmp.c tpbcmp.h
+
x2topXXX_SUFFIX_XXX_SOURCES = \
toppush.c nm2type.c pdb2top.c gen_ad.c \
gen_dum.c pgutil.c resall.c hackblock.c \
pgutil.h topdirs.h toppush.h topexcl.h \
x2top.h
+
xmdrunXXX_SUFFIX_XXX_SOURCES = \
glaasje.c glaasje.h gctio.c init_sh.c \
init_sh.h ionize.c ionize.h ion_data.h \
xmdrun.c do_gct.c do_gct.h relax_sh.c
+
# install-mdrun: install only the mdrun binary instead of every program.
# Builds mdrunXXX_SUFFIX_XXX first (prerequisite), makes sure
# $(DESTDIR)$(bindir) exists, and then copies the binary there with
# $(INSTALL_PROGRAM); the "test -f" guard skips the copy if no binary
# is present in the build directory.
install-mdrun: mdrunXXX_SUFFIX_XXX
$(mkinstalldirs) $(DESTDIR)$(bindir)
if test -f mdrunXXX_SUFFIX_XXX; then \
$(INSTALL_PROGRAM) $(INSTALL_STRIP_FLAG) mdrunXXX_SUFFIX_XXX $(DESTDIR)$(bindir)/mdrunXXX_SUFFIX_XXX; \
fi
+
+
# clean things explicitly, since the target names might have changed
CLEANFILES = ${bin_PROGRAMS} ${EXTRA_PROGRAMS} \
# but it can probably be done in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
-lib_LIBRARIES = libmdXXX_SUFFIX_XXX.a
+# Dependencies and extra objects should come before the target definition
+libmdXXX_SUFFIX_XXX_la_LIBADD = @MDLIB_COND_OBJ@
+libmdXXX_SUFFIX_XXX_la_DEPENDENCIES = @MDLIB_COND_OBJ@
-libmdXXX_SUFFIX_XXX_a_SOURCES = \
+
+
+# Finally, the library definition
+
+lib_LTLIBRARIES = libmdXXX_SUFFIX_XXX.la
+
+libmdXXX_SUFFIX_XXX_la_SOURCES = \
calcmu.c calcvir.c \
congrad.c constr.c coupling.c \
dummies.c ebin.c edsam.c \
vcm.c wnblist.c poisson.h \
splittop.h wnblist.h
-EXTRA_libmdXXX_SUFFIX_XXX_a_SOURCES = \
+EXTRA_libmdXXX_SUFFIX_XXX_la_SOURCES = \
cshake.c csettle.c clincs.c \
fshake.f fsettle.f flincs.f \
fshaked.f fsettled.f flincsd.f
-if USE_DOUBLE
- inner_f77_obj = fshaked.o fsettled.o flincsd.o
-else
- inner_f77_obj = fshake.o fsettle.o flincs.o
-endif
-
-inner_c_obj = cshake.o csettle.o clincs.o
-
-
-libmdXXX_SUFFIX_XXX_a_LIBADD = @INNER_F77_OBJ@ @INNER_C_OBJ@
-libmdXXX_SUFFIX_XXX_a_DEPENDENCIES = @INNER_F77_OBJ@ @INNER_C_OBJ@
-
# clean things explicitly, since the target names might have changed
CLEANFILES = ${lib_LIBRARIES} *_d.a *_mpi.a *~ \\\#*
# but it can probably be done in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+# Keep the link order of the old "-lmd -lgmx" line: libmd first, then libgmx,
+# so that a static link can still resolve libmd's references into libgmx
+# (archives are scanned left to right).
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
# Ngmx requires X - nothing is built if it doesn't exist
# clean things explicitly, since the target names might have changed
CLEANFILES = ${bin_PROGRAMS} ${EXTRA_PROGRAMS} \
- *_mpi *_d *~ \\\#*
\ No newline at end of file
+ *_mpi *_d *~ \\\#*
# but it can probably be done in a nicer way...
INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+# Keep the link order of the old "-lmd -lgmx" line: libmd first, then libgmx,
+# so that a static link can still resolve libmd's references into libgmx
+# (archives are scanned left to right).
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
bin_PROGRAMS = \
averageXXX_SUFFIX_XXX do_dsspXXX_SUFFIX_XXX \